Upload folder using huggingface_hub

Browse files

Files changed (8) hide show

config.json +29 -0
model.safetensors +3 -0
special_tokens_map.json +25 -0
tokenizer.json +0 -0
tokenizer.model +3 -0
tokenizer_config.json +40 -0
trainer_state.json +617 -0
training_args.bin +3 -0

config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "_name_or_path": "princeton-nlp/Sheared-LLaMA-1.3B",
+  "architectures": [
+    "LlamaBiModel"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 2048,
+  "initializer_range": 0.02,
+  "intermediate_size": 5504,
+  "max_position_embeddings": 4096,
+  "model_type": "llama",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "num_key_value_heads": 16,
+  "pad_token_id": 0,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.40.2",
+  "use_cache": true,
+  "vocab_size": 32000
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:06b5579fd0d157162fbf371573b4a415c23bb5eec858b575220112927e9ada76
+size 2559798552

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": "_",
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "mask_token": "_",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "</s>",
+  "sp_model_kwargs": {},
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,617 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.32923775694404,
+  "eval_steps": 10000,
+  "global_step": 40000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0291154719618005,
+      "grad_norm": 5.6875,
+      "learning_rate": 4.9514742133969994e-05,
+      "loss": 2.4005,
+      "step": 500
+    },
+    {
+      "epoch": 0.058230943923601,
+      "grad_norm": 5.125,
+      "learning_rate": 4.9029484267939986e-05,
+      "loss": 1.8762,
+      "step": 1000
+    },
+    {
+      "epoch": 0.0873464158854015,
+      "grad_norm": 5.21875,
+      "learning_rate": 4.854422640190998e-05,
+      "loss": 1.7508,
+      "step": 1500
+    },
+    {
+      "epoch": 0.116461887847202,
+      "grad_norm": 5.125,
+      "learning_rate": 4.805896853587997e-05,
+      "loss": 1.6692,
+      "step": 2000
+    },
+    {
+      "epoch": 0.1455773598090025,
+      "grad_norm": 5.03125,
+      "learning_rate": 4.757371066984996e-05,
+      "loss": 1.6309,
+      "step": 2500
+    },
+    {
+      "epoch": 0.174692831770803,
+      "grad_norm": 4.6875,
+      "learning_rate": 4.708845280381995e-05,
+      "loss": 1.5878,
+      "step": 3000
+    },
+    {
+      "epoch": 0.2038083037326035,
+      "grad_norm": 4.90625,
+      "learning_rate": 4.6603194937789944e-05,
+      "loss": 1.5593,
+      "step": 3500
+    },
+    {
+      "epoch": 0.232923775694404,
+      "grad_norm": 4.65625,
+      "learning_rate": 4.6117937071759936e-05,
+      "loss": 1.5287,
+      "step": 4000
+    },
+    {
+      "epoch": 0.2620392476562045,
+      "grad_norm": 5.53125,
+      "learning_rate": 4.563267920572993e-05,
+      "loss": 1.5189,
+      "step": 4500
+    },
+    {
+      "epoch": 0.291154719618005,
+      "grad_norm": 4.53125,
+      "learning_rate": 4.514742133969992e-05,
+      "loss": 1.5018,
+      "step": 5000
+    },
+    {
+      "epoch": 0.3202701915798055,
+      "grad_norm": 4.8125,
+      "learning_rate": 4.466216347366991e-05,
+      "loss": 1.4784,
+      "step": 5500
+    },
+    {
+      "epoch": 0.349385663541606,
+      "grad_norm": 4.6875,
+      "learning_rate": 4.41769056076399e-05,
+      "loss": 1.4703,
+      "step": 6000
+    },
+    {
+      "epoch": 0.3785011355034065,
+      "grad_norm": 4.5,
+      "learning_rate": 4.3691647741609894e-05,
+      "loss": 1.449,
+      "step": 6500
+    },
+    {
+      "epoch": 0.407616607465207,
+      "grad_norm": 4.71875,
+      "learning_rate": 4.3206389875579886e-05,
+      "loss": 1.4496,
+      "step": 7000
+    },
+    {
+      "epoch": 0.43673207942700754,
+      "grad_norm": 4.65625,
+      "learning_rate": 4.272113200954988e-05,
+      "loss": 1.4421,
+      "step": 7500
+    },
+    {
+      "epoch": 0.465847551388808,
+      "grad_norm": 4.90625,
+      "learning_rate": 4.223587414351987e-05,
+      "loss": 1.4305,
+      "step": 8000
+    },
+    {
+      "epoch": 0.4949630233506085,
+      "grad_norm": 4.25,
+      "learning_rate": 4.175061627748986e-05,
+      "loss": 1.4251,
+      "step": 8500
+    },
+    {
+      "epoch": 0.524078495312409,
+      "grad_norm": 4.53125,
+      "learning_rate": 4.126535841145985e-05,
+      "loss": 1.4116,
+      "step": 9000
+    },
+    {
+      "epoch": 0.5531939672742096,
+      "grad_norm": 4.375,
+      "learning_rate": 4.0780100545429844e-05,
+      "loss": 1.4124,
+      "step": 9500
+    },
+    {
+      "epoch": 0.58230943923601,
+      "grad_norm": 5.40625,
+      "learning_rate": 4.0294842679399835e-05,
+      "loss": 1.4035,
+      "step": 10000
+    },
+    {
+      "epoch": 0.58230943923601,
+      "eval_accuracy": 0.7037869026306487,
+      "eval_loss": 1.3742600679397583,
+      "eval_runtime": 6.4605,
+      "eval_samples_per_second": 89.931,
+      "eval_steps_per_second": 2.941,
+      "step": 10000
+    },
+    {
+      "epoch": 0.6114249111978105,
+      "grad_norm": 4.40625,
+      "learning_rate": 3.980958481336983e-05,
+      "loss": 1.4042,
+      "step": 10500
+    },
+    {
+      "epoch": 0.640540383159611,
+      "grad_norm": 4.96875,
+      "learning_rate": 3.932432694733982e-05,
+      "loss": 1.3966,
+      "step": 11000
+    },
+    {
+      "epoch": 0.6696558551214116,
+      "grad_norm": 4.5625,
+      "learning_rate": 3.883906908130981e-05,
+      "loss": 1.3887,
+      "step": 11500
+    },
+    {
+      "epoch": 0.698771327083212,
+      "grad_norm": 5.0,
+      "learning_rate": 3.83538112152798e-05,
+      "loss": 1.3843,
+      "step": 12000
+    },
+    {
+      "epoch": 0.7278867990450125,
+      "grad_norm": 4.65625,
+      "learning_rate": 3.7868553349249794e-05,
+      "loss": 1.3783,
+      "step": 12500
+    },
+    {
+      "epoch": 0.757002271006813,
+      "grad_norm": 4.6875,
+      "learning_rate": 3.7383295483219785e-05,
+      "loss": 1.375,
+      "step": 13000
+    },
+    {
+      "epoch": 0.7861177429686135,
+      "grad_norm": 4.46875,
+      "learning_rate": 3.689803761718978e-05,
+      "loss": 1.3756,
+      "step": 13500
+    },
+    {
+      "epoch": 0.815233214930414,
+      "grad_norm": 4.65625,
+      "learning_rate": 3.641277975115977e-05,
+      "loss": 1.3636,
+      "step": 14000
+    },
+    {
+      "epoch": 0.8443486868922145,
+      "grad_norm": 4.5625,
+      "learning_rate": 3.592752188512976e-05,
+      "loss": 1.3688,
+      "step": 14500
+    },
+    {
+      "epoch": 0.8734641588540151,
+      "grad_norm": 4.59375,
+      "learning_rate": 3.544226401909975e-05,
+      "loss": 1.3652,
+      "step": 15000
+    },
+    {
+      "epoch": 0.9025796308158155,
+      "grad_norm": 4.90625,
+      "learning_rate": 3.4957006153069744e-05,
+      "loss": 1.3611,
+      "step": 15500
+    },
+    {
+      "epoch": 0.931695102777616,
+      "grad_norm": 5.09375,
+      "learning_rate": 3.4471748287039735e-05,
+      "loss": 1.352,
+      "step": 16000
+    },
+    {
+      "epoch": 0.9608105747394166,
+      "grad_norm": 4.625,
+      "learning_rate": 3.398649042100973e-05,
+      "loss": 1.3562,
+      "step": 16500
+    },
+    {
+      "epoch": 0.989926046701217,
+      "grad_norm": 4.84375,
+      "learning_rate": 3.350123255497972e-05,
+      "loss": 1.3491,
+      "step": 17000
+    },
+    {
+      "epoch": 1.0190415186630175,
+      "grad_norm": 5.15625,
+      "learning_rate": 3.301597468894971e-05,
+      "loss": 1.3448,
+      "step": 17500
+    },
+    {
+      "epoch": 1.048156990624818,
+      "grad_norm": 4.625,
+      "learning_rate": 3.25307168229197e-05,
+      "loss": 1.344,
+      "step": 18000
+    },
+    {
+      "epoch": 1.0772724625866186,
+      "grad_norm": 4.75,
+      "learning_rate": 3.204545895688969e-05,
+      "loss": 1.3381,
+      "step": 18500
+    },
+    {
+      "epoch": 1.106387934548419,
+      "grad_norm": 4.8125,
+      "learning_rate": 3.1560201090859685e-05,
+      "loss": 1.34,
+      "step": 19000
+    },
+    {
+      "epoch": 1.1355034065102196,
+      "grad_norm": 5.375,
+      "learning_rate": 3.107494322482968e-05,
+      "loss": 1.3352,
+      "step": 19500
+    },
+    {
+      "epoch": 1.16461887847202,
+      "grad_norm": 5.71875,
+      "learning_rate": 3.058968535879967e-05,
+      "loss": 1.3374,
+      "step": 20000
+    },
+    {
+      "epoch": 1.16461887847202,
+      "eval_accuracy": 0.7118764765058947,
+      "eval_loss": 1.3248625993728638,
+      "eval_runtime": 6.4569,
+      "eval_samples_per_second": 89.981,
+      "eval_steps_per_second": 2.943,
+      "step": 20000
+    },
+    {
+      "epoch": 1.1937343504338205,
+      "grad_norm": 4.5,
+      "learning_rate": 3.010442749276966e-05,
+      "loss": 1.3314,
+      "step": 20500
+    },
+    {
+      "epoch": 1.222849822395621,
+      "grad_norm": 5.09375,
+      "learning_rate": 2.9619169626739652e-05,
+      "loss": 1.3296,
+      "step": 21000
+    },
+    {
+      "epoch": 1.2519652943574215,
+      "grad_norm": 4.40625,
+      "learning_rate": 2.9133911760709644e-05,
+      "loss": 1.3331,
+      "step": 21500
+    },
+    {
+      "epoch": 1.281080766319222,
+      "grad_norm": 4.75,
+      "learning_rate": 2.8648653894679632e-05,
+      "loss": 1.332,
+      "step": 22000
+    },
+    {
+      "epoch": 1.3101962382810226,
+      "grad_norm": 5.09375,
+      "learning_rate": 2.8163396028649623e-05,
+      "loss": 1.3291,
+      "step": 22500
+    },
+    {
+      "epoch": 1.3393117102428231,
+      "grad_norm": 4.53125,
+      "learning_rate": 2.7678138162619615e-05,
+      "loss": 1.3301,
+      "step": 23000
+    },
+    {
+      "epoch": 1.3684271822046234,
+      "grad_norm": 4.8125,
+      "learning_rate": 2.7192880296589607e-05,
+      "loss": 1.3266,
+      "step": 23500
+    },
+    {
+      "epoch": 1.397542654166424,
+      "grad_norm": 5.0,
+      "learning_rate": 2.67076224305596e-05,
+      "loss": 1.3178,
+      "step": 24000
+    },
+    {
+      "epoch": 1.4266581261282245,
+      "grad_norm": 5.15625,
+      "learning_rate": 2.6222364564529593e-05,
+      "loss": 1.323,
+      "step": 24500
+    },
+    {
+      "epoch": 1.455773598090025,
+      "grad_norm": 4.78125,
+      "learning_rate": 2.5737106698499585e-05,
+      "loss": 1.3206,
+      "step": 25000
+    },
+    {
+      "epoch": 1.4848890700518256,
+      "grad_norm": 5.0625,
+      "learning_rate": 2.5251848832469577e-05,
+      "loss": 1.319,
+      "step": 25500
+    },
+    {
+      "epoch": 1.5140045420136259,
+      "grad_norm": 5.34375,
+      "learning_rate": 2.476659096643957e-05,
+      "loss": 1.3129,
+      "step": 26000
+    },
+    {
+      "epoch": 1.5431200139754266,
+      "grad_norm": 4.96875,
+      "learning_rate": 2.428133310040956e-05,
+      "loss": 1.3163,
+      "step": 26500
+    },
+    {
+      "epoch": 1.572235485937227,
+      "grad_norm": 4.59375,
+      "learning_rate": 2.3796075234379552e-05,
+      "loss": 1.313,
+      "step": 27000
+    },
+    {
+      "epoch": 1.6013509578990277,
+      "grad_norm": 4.40625,
+      "learning_rate": 2.331081736834954e-05,
+      "loss": 1.3167,
+      "step": 27500
+    },
+    {
+      "epoch": 1.630466429860828,
+      "grad_norm": 5.09375,
+      "learning_rate": 2.282555950231953e-05,
+      "loss": 1.3151,
+      "step": 28000
+    },
+    {
+      "epoch": 1.6595819018226285,
+      "grad_norm": 5.09375,
+      "learning_rate": 2.2340301636289527e-05,
+      "loss": 1.312,
+      "step": 28500
+    },
+    {
+      "epoch": 1.688697373784429,
+      "grad_norm": 4.71875,
+      "learning_rate": 2.185504377025952e-05,
+      "loss": 1.3196,
+      "step": 29000
+    },
+    {
+      "epoch": 1.7178128457462294,
+      "grad_norm": 4.46875,
+      "learning_rate": 2.136978590422951e-05,
+      "loss": 1.3149,
+      "step": 29500
+    },
+    {
+      "epoch": 1.7469283177080301,
+      "grad_norm": 4.71875,
+      "learning_rate": 2.0884528038199498e-05,
+      "loss": 1.3142,
+      "step": 30000
+    },
+    {
+      "epoch": 1.7469283177080301,
+      "eval_accuracy": 0.7175331317769543,
+      "eval_loss": 1.2938237190246582,
+      "eval_runtime": 6.453,
+      "eval_samples_per_second": 90.036,
+      "eval_steps_per_second": 2.944,
+      "step": 30000
+    },
+    {
+      "epoch": 1.7760437896698305,
+      "grad_norm": 5.03125,
+      "learning_rate": 2.039927017216949e-05,
+      "loss": 1.3115,
+      "step": 30500
+    },
+    {
+      "epoch": 1.8051592616316312,
+      "grad_norm": 4.90625,
+      "learning_rate": 1.9914012306139485e-05,
+      "loss": 1.3155,
+      "step": 31000
+    },
+    {
+      "epoch": 1.8342747335934315,
+      "grad_norm": 5.0,
+      "learning_rate": 1.9428754440109477e-05,
+      "loss": 1.3179,
+      "step": 31500
+    },
+    {
+      "epoch": 1.863390205555232,
+      "grad_norm": 5.21875,
+      "learning_rate": 1.8943496574079468e-05,
+      "loss": 1.3091,
+      "step": 32000
+    },
+    {
+      "epoch": 1.8925056775170326,
+      "grad_norm": 4.46875,
+      "learning_rate": 1.8458238708049457e-05,
+      "loss": 1.3148,
+      "step": 32500
+    },
+    {
+      "epoch": 1.921621149478833,
+      "grad_norm": 5.15625,
+      "learning_rate": 1.7972980842019448e-05,
+      "loss": 1.3091,
+      "step": 33000
+    },
+    {
+      "epoch": 1.9507366214406336,
+      "grad_norm": 5.03125,
+      "learning_rate": 1.7487722975989443e-05,
+      "loss": 1.3176,
+      "step": 33500
+    },
+    {
+      "epoch": 1.979852093402434,
+      "grad_norm": 4.625,
+      "learning_rate": 1.7002465109959435e-05,
+      "loss": 1.3135,
+      "step": 34000
+    },
+    {
+      "epoch": 2.0089675653642347,
+      "grad_norm": 5.03125,
+      "learning_rate": 1.6517207243929427e-05,
+      "loss": 1.3134,
+      "step": 34500
+    },
+    {
+      "epoch": 2.038083037326035,
+      "grad_norm": 4.875,
+      "learning_rate": 1.6031949377899415e-05,
+      "loss": 1.3061,
+      "step": 35000
+    },
+    {
+      "epoch": 2.0671985092878353,
+      "grad_norm": 4.84375,
+      "learning_rate": 1.5546691511869406e-05,
+      "loss": 1.3053,
+      "step": 35500
+    },
+    {
+      "epoch": 2.096313981249636,
+      "grad_norm": 4.40625,
+      "learning_rate": 1.5061433645839398e-05,
+      "loss": 1.3102,
+      "step": 36000
+    },
+    {
+      "epoch": 2.1254294532114364,
+      "grad_norm": 4.90625,
+      "learning_rate": 1.4576175779809393e-05,
+      "loss": 1.3137,
+      "step": 36500
+    },
+    {
+      "epoch": 2.154544925173237,
+      "grad_norm": 4.75,
+      "learning_rate": 1.4090917913779383e-05,
+      "loss": 1.306,
+      "step": 37000
+    },
+    {
+      "epoch": 2.1836603971350375,
+      "grad_norm": 4.9375,
+      "learning_rate": 1.3605660047749375e-05,
+      "loss": 1.3029,
+      "step": 37500
+    },
+    {
+      "epoch": 2.212775869096838,
+      "grad_norm": 5.34375,
+      "learning_rate": 1.3120402181719366e-05,
+      "loss": 1.3068,
+      "step": 38000
+    },
+    {
+      "epoch": 2.2418913410586385,
+      "grad_norm": 4.96875,
+      "learning_rate": 1.2635144315689356e-05,
+      "loss": 1.3033,
+      "step": 38500
+    },
+    {
+      "epoch": 2.2710068130204393,
+      "grad_norm": 5.46875,
+      "learning_rate": 1.214988644965935e-05,
+      "loss": 1.3104,
+      "step": 39000
+    },
+    {
+      "epoch": 2.3001222849822396,
+      "grad_norm": 4.71875,
+      "learning_rate": 1.1664628583629341e-05,
+      "loss": 1.3058,
+      "step": 39500
+    },
+    {
+      "epoch": 2.32923775694404,
+      "grad_norm": 4.53125,
+      "learning_rate": 1.1179370717599333e-05,
+      "loss": 1.3064,
+      "step": 40000
+    },
+    {
+      "epoch": 2.32923775694404,
+      "eval_accuracy": 0.7175919626253005,
+      "eval_loss": 1.2843679189682007,
+      "eval_runtime": 12.4622,
+      "eval_samples_per_second": 46.621,
+      "eval_steps_per_second": 1.525,
+      "step": 40000
+    }
+  ],
+  "logging_steps": 500,
+  "max_steps": 51519,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 10000,
+  "total_flos": 2.5163609407488e+18,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b72e646bb40995a7e3ddeabf4857d1aebbd0ce8672eb70c45afd1146be1ff620
+size 5048