PUSH last checkpoint
32624be
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 100,
"global_step": 1408,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007109215320359016,
"grad_norm": 22.657577985866105,
"learning_rate": 9.302325581395349e-06,
"loss": 2.574,
"mean_token_accuracy": 0.5464246176183224,
"num_tokens": 4589382.0,
"step": 5
},
{
"epoch": 0.014218430640718031,
"grad_norm": 2.3543289370923013,
"learning_rate": 2.0930232558139536e-05,
"loss": 1.4882,
"mean_token_accuracy": 0.6589333653450012,
"num_tokens": 9171524.0,
"step": 10
},
{
"epoch": 0.021327645961077047,
"grad_norm": 0.8063547574982903,
"learning_rate": 3.2558139534883724e-05,
"loss": 1.0174,
"mean_token_accuracy": 0.7330243036150932,
"num_tokens": 13765157.0,
"step": 15
},
{
"epoch": 0.028436861281436063,
"grad_norm": 0.572573905518242,
"learning_rate": 4.418604651162791e-05,
"loss": 0.8773,
"mean_token_accuracy": 0.7569610200822353,
"num_tokens": 18369874.0,
"step": 20
},
{
"epoch": 0.035546076601795075,
"grad_norm": 0.5738482260117446,
"learning_rate": 5.5813953488372095e-05,
"loss": 0.7975,
"mean_token_accuracy": 0.7729738861322403,
"num_tokens": 22960290.0,
"step": 25
},
{
"epoch": 0.042655291922154094,
"grad_norm": 0.5016568944917689,
"learning_rate": 6.744186046511628e-05,
"loss": 0.7632,
"mean_token_accuracy": 0.778630904853344,
"num_tokens": 27556623.0,
"step": 30
},
{
"epoch": 0.049764507242513106,
"grad_norm": 0.4845613474361907,
"learning_rate": 7.906976744186047e-05,
"loss": 0.7326,
"mean_token_accuracy": 0.7872321248054505,
"num_tokens": 32158408.0,
"step": 35
},
{
"epoch": 0.056873722562872125,
"grad_norm": 0.4270154516127363,
"learning_rate": 9.069767441860465e-05,
"loss": 0.7095,
"mean_token_accuracy": 0.7919960044324398,
"num_tokens": 36742233.0,
"step": 40
},
{
"epoch": 0.06398293788323114,
"grad_norm": 0.499498695141066,
"learning_rate": 9.9999880816326e-05,
"loss": 0.6973,
"mean_token_accuracy": 0.7952379912137986,
"num_tokens": 41335670.0,
"step": 45
},
{
"epoch": 0.07109215320359015,
"grad_norm": 0.4645180201543763,
"learning_rate": 9.999570945402425e-05,
"loss": 0.6853,
"mean_token_accuracy": 0.7981184311211109,
"num_tokens": 45940079.0,
"step": 50
},
{
"epoch": 0.07820136852394917,
"grad_norm": 0.434255531179794,
"learning_rate": 9.998557953932929e-05,
"loss": 0.6688,
"mean_token_accuracy": 0.8012012615799904,
"num_tokens": 50533771.0,
"step": 55
},
{
"epoch": 0.08531058384430819,
"grad_norm": 0.393754634337621,
"learning_rate": 9.99694924136941e-05,
"loss": 0.6725,
"mean_token_accuracy": 0.800255061686039,
"num_tokens": 55133444.0,
"step": 60
},
{
"epoch": 0.0924197991646672,
"grad_norm": 0.49718727212066355,
"learning_rate": 9.99474502074547e-05,
"loss": 0.6664,
"mean_token_accuracy": 0.801218880712986,
"num_tokens": 59726447.0,
"step": 65
},
{
"epoch": 0.09952901448502621,
"grad_norm": 0.4005142024066312,
"learning_rate": 9.991945583954808e-05,
"loss": 0.6549,
"mean_token_accuracy": 0.8056452445685863,
"num_tokens": 64319917.0,
"step": 70
},
{
"epoch": 0.10663822980538523,
"grad_norm": 0.3774090383980249,
"learning_rate": 9.988551301712567e-05,
"loss": 0.6454,
"mean_token_accuracy": 0.806719920784235,
"num_tokens": 68898868.0,
"step": 75
},
{
"epoch": 0.11374744512574425,
"grad_norm": 0.3995895890256704,
"learning_rate": 9.984562623506235e-05,
"loss": 0.6464,
"mean_token_accuracy": 0.8064703330397606,
"num_tokens": 73481972.0,
"step": 80
},
{
"epoch": 0.12085666044610326,
"grad_norm": 0.3801619159341505,
"learning_rate": 9.979980077536136e-05,
"loss": 0.6462,
"mean_token_accuracy": 0.8080633491277694,
"num_tokens": 78079419.0,
"step": 85
},
{
"epoch": 0.1279658757664623,
"grad_norm": 0.37074794226689833,
"learning_rate": 9.974804270645462e-05,
"loss": 0.6362,
"mean_token_accuracy": 0.8091117829084397,
"num_tokens": 82670195.0,
"step": 90
},
{
"epoch": 0.13507509108682128,
"grad_norm": 0.37193721608812236,
"learning_rate": 9.969035888239937e-05,
"loss": 0.635,
"mean_token_accuracy": 0.8079991653561592,
"num_tokens": 87257953.0,
"step": 95
},
{
"epoch": 0.1421843064071803,
"grad_norm": 0.36251703620037773,
"learning_rate": 9.96267569419703e-05,
"loss": 0.6315,
"mean_token_accuracy": 0.8096475720405578,
"num_tokens": 91838382.0,
"step": 100
},
{
"epoch": 0.1421843064071803,
"eval_loss": 0.5971412062644958,
"eval_mean_token_accuracy": 0.8093206621052926,
"eval_num_tokens": 91838382.0,
"eval_runtime": 141.8153,
"eval_samples_per_second": 25.653,
"eval_steps_per_second": 0.804,
"step": 100
},
{
"epoch": 0.14929352172753932,
"grad_norm": 0.41583625971563776,
"learning_rate": 9.955724530764809e-05,
"loss": 0.6381,
"mean_token_accuracy": 0.8077230393886566,
"num_tokens": 96431755.0,
"step": 105
},
{
"epoch": 0.15640273704789834,
"grad_norm": 0.3705693803444073,
"learning_rate": 9.948183318450413e-05,
"loss": 0.6197,
"mean_token_accuracy": 0.8116156131029129,
"num_tokens": 101027690.0,
"step": 110
},
{
"epoch": 0.16351195236825736,
"grad_norm": 0.3214510651452395,
"learning_rate": 9.940053055898133e-05,
"loss": 0.6313,
"mean_token_accuracy": 0.8089181430637836,
"num_tokens": 105628547.0,
"step": 115
},
{
"epoch": 0.17062116768861638,
"grad_norm": 0.34220731720085373,
"learning_rate": 9.93133481975719e-05,
"loss": 0.6077,
"mean_token_accuracy": 0.814984206855297,
"num_tokens": 110243592.0,
"step": 120
},
{
"epoch": 0.1777303830089754,
"grad_norm": 0.35675802487560043,
"learning_rate": 9.922029764539148e-05,
"loss": 0.6263,
"mean_token_accuracy": 0.8096928559243679,
"num_tokens": 114832845.0,
"step": 125
},
{
"epoch": 0.1848395983293344,
"grad_norm": 0.3422296936678833,
"learning_rate": 9.912139122465027e-05,
"loss": 0.6116,
"mean_token_accuracy": 0.8140982151031494,
"num_tokens": 119435421.0,
"step": 130
},
{
"epoch": 0.1919488136496934,
"grad_norm": 0.3599918244922273,
"learning_rate": 9.901664203302126e-05,
"loss": 0.6052,
"mean_token_accuracy": 0.8154805108904839,
"num_tokens": 124028647.0,
"step": 135
},
{
"epoch": 0.19905802897005243,
"grad_norm": 0.3595154303423279,
"learning_rate": 9.890606394190588e-05,
"loss": 0.6126,
"mean_token_accuracy": 0.8132404424250126,
"num_tokens": 128628413.0,
"step": 140
},
{
"epoch": 0.20616724429041144,
"grad_norm": 0.3711012466200944,
"learning_rate": 9.878967159459693e-05,
"loss": 0.6068,
"mean_token_accuracy": 0.8164977565407753,
"num_tokens": 133219422.0,
"step": 145
},
{
"epoch": 0.21327645961077046,
"grad_norm": 0.35910926284617867,
"learning_rate": 9.866748040433956e-05,
"loss": 0.6099,
"mean_token_accuracy": 0.8152773261070252,
"num_tokens": 137825952.0,
"step": 150
},
{
"epoch": 0.22038567493112948,
"grad_norm": 0.4205439208166243,
"learning_rate": 9.853950655229009e-05,
"loss": 0.6064,
"mean_token_accuracy": 0.815191026777029,
"num_tokens": 142422368.0,
"step": 155
},
{
"epoch": 0.2274948902514885,
"grad_norm": 0.32091150374802263,
"learning_rate": 9.840576698537329e-05,
"loss": 0.6093,
"mean_token_accuracy": 0.8135301224887371,
"num_tokens": 147015990.0,
"step": 160
},
{
"epoch": 0.23460410557184752,
"grad_norm": 0.32627028158119226,
"learning_rate": 9.826627941403811e-05,
"loss": 0.5969,
"mean_token_accuracy": 0.8182829037308693,
"num_tokens": 151627096.0,
"step": 165
},
{
"epoch": 0.2417133208922065,
"grad_norm": 0.32405674248273,
"learning_rate": 9.812106230991248e-05,
"loss": 0.6068,
"mean_token_accuracy": 0.8159149341285229,
"num_tokens": 156218968.0,
"step": 170
},
{
"epoch": 0.24882253621256553,
"grad_norm": 0.3206982540127891,
"learning_rate": 9.79701349033571e-05,
"loss": 0.6039,
"mean_token_accuracy": 0.8161494679749012,
"num_tokens": 160797401.0,
"step": 175
},
{
"epoch": 0.2559317515329246,
"grad_norm": 0.3360732448004463,
"learning_rate": 9.78135171809189e-05,
"loss": 0.6068,
"mean_token_accuracy": 0.8159954428672791,
"num_tokens": 165402684.0,
"step": 180
},
{
"epoch": 0.26304096685328354,
"grad_norm": 0.33789233259366286,
"learning_rate": 9.76512298826844e-05,
"loss": 0.6026,
"mean_token_accuracy": 0.8167447924613953,
"num_tokens": 169997282.0,
"step": 185
},
{
"epoch": 0.27015018217364256,
"grad_norm": 0.3089560668153988,
"learning_rate": 9.748329449953302e-05,
"loss": 0.5904,
"mean_token_accuracy": 0.8193566597998142,
"num_tokens": 174589836.0,
"step": 190
},
{
"epoch": 0.2772593974940016,
"grad_norm": 0.32060053414915524,
"learning_rate": 9.73097332702914e-05,
"loss": 0.6044,
"mean_token_accuracy": 0.8175870932638645,
"num_tokens": 179181747.0,
"step": 195
},
{
"epoch": 0.2843686128143606,
"grad_norm": 0.32004664048912745,
"learning_rate": 9.713056917878818e-05,
"loss": 0.5888,
"mean_token_accuracy": 0.8192018747329712,
"num_tokens": 183760367.0,
"step": 200
},
{
"epoch": 0.2843686128143606,
"eval_loss": 0.5599971413612366,
"eval_mean_token_accuracy": 0.8188303473748659,
"eval_num_tokens": 183760367.0,
"eval_runtime": 145.8536,
"eval_samples_per_second": 24.943,
"eval_steps_per_second": 0.782,
"step": 200
},
{
"epoch": 0.2914778281347196,
"grad_norm": 0.3094551492752116,
"learning_rate": 9.694582595081057e-05,
"loss": 0.5872,
"mean_token_accuracy": 0.819921114295721,
"num_tokens": 188360903.0,
"step": 205
},
{
"epoch": 0.29858704345507864,
"grad_norm": 0.36254147904822126,
"learning_rate": 9.67555280509623e-05,
"loss": 0.5942,
"mean_token_accuracy": 0.817745155096054,
"num_tokens": 192932381.0,
"step": 210
},
{
"epoch": 0.30569625877543766,
"grad_norm": 0.3377909564779145,
"learning_rate": 9.655970067942405e-05,
"loss": 0.5994,
"mean_token_accuracy": 0.8163805276155471,
"num_tokens": 197505985.0,
"step": 215
},
{
"epoch": 0.3128054740957967,
"grad_norm": 0.30751780494672465,
"learning_rate": 9.63583697686162e-05,
"loss": 0.5902,
"mean_token_accuracy": 0.8196643941104412,
"num_tokens": 202105424.0,
"step": 220
},
{
"epoch": 0.3199146894161557,
"grad_norm": 0.34345028355301316,
"learning_rate": 9.615156197976477e-05,
"loss": 0.582,
"mean_token_accuracy": 0.8217154465615749,
"num_tokens": 206686951.0,
"step": 225
},
{
"epoch": 0.3270239047365147,
"grad_norm": 0.3216135018716631,
"learning_rate": 9.593930469937087e-05,
"loss": 0.5708,
"mean_token_accuracy": 0.8250658005475998,
"num_tokens": 211278788.0,
"step": 230
},
{
"epoch": 0.33413312005687373,
"grad_norm": 0.32564659909940696,
"learning_rate": 9.572162603558393e-05,
"loss": 0.5928,
"mean_token_accuracy": 0.819525595754385,
"num_tokens": 215877205.0,
"step": 235
},
{
"epoch": 0.34124233537723275,
"grad_norm": 0.4839583335140069,
"learning_rate": 9.549855481447954e-05,
"loss": 0.5882,
"mean_token_accuracy": 0.8204580388963223,
"num_tokens": 220486454.0,
"step": 240
},
{
"epoch": 0.34835155069759177,
"grad_norm": 0.3268671171699921,
"learning_rate": 9.527012057624224e-05,
"loss": 0.5836,
"mean_token_accuracy": 0.8208626843988895,
"num_tokens": 225080225.0,
"step": 245
},
{
"epoch": 0.3554607660179508,
"grad_norm": 0.3244498327733708,
"learning_rate": 9.50363535712535e-05,
"loss": 0.586,
"mean_token_accuracy": 0.8207595020532608,
"num_tokens": 229657012.0,
"step": 250
},
{
"epoch": 0.3625699813383098,
"grad_norm": 0.29889265357291406,
"learning_rate": 9.479728475608593e-05,
"loss": 0.5919,
"mean_token_accuracy": 0.8190862230956555,
"num_tokens": 234248976.0,
"step": 255
},
{
"epoch": 0.3696791966586688,
"grad_norm": 0.34636883393423384,
"learning_rate": 9.455294578940384e-05,
"loss": 0.5765,
"mean_token_accuracy": 0.8226364821195602,
"num_tokens": 238829734.0,
"step": 260
},
{
"epoch": 0.3767884119790278,
"grad_norm": 0.3092592234408446,
"learning_rate": 9.430336902777083e-05,
"loss": 0.576,
"mean_token_accuracy": 0.821333235502243,
"num_tokens": 243418989.0,
"step": 265
},
{
"epoch": 0.3838976272993868,
"grad_norm": 0.30454136223380207,
"learning_rate": 9.404858752136499e-05,
"loss": 0.5771,
"mean_token_accuracy": 0.8237294301390647,
"num_tokens": 248015701.0,
"step": 270
},
{
"epoch": 0.39100684261974583,
"grad_norm": 0.30289215095264577,
"learning_rate": 9.378863500960222e-05,
"loss": 0.5709,
"mean_token_accuracy": 0.8236084163188935,
"num_tokens": 252613191.0,
"step": 275
},
{
"epoch": 0.39811605794010485,
"grad_norm": 0.3010273864601919,
"learning_rate": 9.352354591666827e-05,
"loss": 0.5861,
"mean_token_accuracy": 0.820894256979227,
"num_tokens": 257210808.0,
"step": 280
},
{
"epoch": 0.40522527326046387,
"grad_norm": 0.30175911100812025,
"learning_rate": 9.325335534696017e-05,
"loss": 0.5753,
"mean_token_accuracy": 0.8225005254149437,
"num_tokens": 261790131.0,
"step": 285
},
{
"epoch": 0.4123344885808229,
"grad_norm": 0.28871941798325856,
"learning_rate": 9.29780990804375e-05,
"loss": 0.5799,
"mean_token_accuracy": 0.821347926557064,
"num_tokens": 266377324.0,
"step": 290
},
{
"epoch": 0.4194437039011819,
"grad_norm": 0.28095014086273895,
"learning_rate": 9.269781356788424e-05,
"loss": 0.581,
"mean_token_accuracy": 0.8209108576178551,
"num_tokens": 270967910.0,
"step": 295
},
{
"epoch": 0.4265529192215409,
"grad_norm": 0.2893211807696515,
"learning_rate": 9.241253592608183e-05,
"loss": 0.5755,
"mean_token_accuracy": 0.8242007777094841,
"num_tokens": 275570273.0,
"step": 300
},
{
"epoch": 0.4265529192215409,
"eval_loss": 0.5416839122772217,
"eval_mean_token_accuracy": 0.8231211885025627,
"eval_num_tokens": 275570273.0,
"eval_runtime": 145.5254,
"eval_samples_per_second": 24.999,
"eval_steps_per_second": 0.783,
"step": 300
},
{
"epoch": 0.43366213454189995,
"grad_norm": 0.30733885282429685,
"learning_rate": 9.212230393289385e-05,
"loss": 0.5781,
"mean_token_accuracy": 0.8230207331478596,
"num_tokens": 280172533.0,
"step": 305
},
{
"epoch": 0.44077134986225897,
"grad_norm": 0.2682470819307261,
"learning_rate": 9.182715602226341e-05,
"loss": 0.5625,
"mean_token_accuracy": 0.8270745746791363,
"num_tokens": 284763929.0,
"step": 310
},
{
"epoch": 0.447880565182618,
"grad_norm": 0.2962012849994535,
"learning_rate": 9.152713127912355e-05,
"loss": 0.5848,
"mean_token_accuracy": 0.8201167277991772,
"num_tokens": 289376903.0,
"step": 315
},
{
"epoch": 0.454989780502977,
"grad_norm": 0.28564514411407316,
"learning_rate": 9.12222694342213e-05,
"loss": 0.5732,
"mean_token_accuracy": 0.8246621482074261,
"num_tokens": 293966796.0,
"step": 320
},
{
"epoch": 0.462098995823336,
"grad_norm": 0.30020425973519915,
"learning_rate": 9.091261085885646e-05,
"loss": 0.5606,
"mean_token_accuracy": 0.826822079718113,
"num_tokens": 298540346.0,
"step": 325
},
{
"epoch": 0.46920821114369504,
"grad_norm": 0.2887047887642146,
"learning_rate": 9.059819655953536e-05,
"loss": 0.5738,
"mean_token_accuracy": 0.823461939394474,
"num_tokens": 303112604.0,
"step": 330
},
{
"epoch": 0.476317426464054,
"grad_norm": 0.3180269352697689,
"learning_rate": 9.027906817254063e-05,
"loss": 0.5654,
"mean_token_accuracy": 0.8256018176674843,
"num_tokens": 307694241.0,
"step": 335
},
{
"epoch": 0.483426641784413,
"grad_norm": 0.29567931374872014,
"learning_rate": 8.995526795841753e-05,
"loss": 0.558,
"mean_token_accuracy": 0.8256605207920075,
"num_tokens": 312289299.0,
"step": 340
},
{
"epoch": 0.49053585710477204,
"grad_norm": 0.3336504103662035,
"learning_rate": 8.962683879637747e-05,
"loss": 0.5617,
"mean_token_accuracy": 0.8257805988192558,
"num_tokens": 316884766.0,
"step": 345
},
{
"epoch": 0.49764507242513106,
"grad_norm": 0.3705167375534613,
"learning_rate": 8.929382417861991e-05,
"loss": 0.561,
"mean_token_accuracy": 0.8267210200428963,
"num_tokens": 321461198.0,
"step": 350
},
{
"epoch": 0.5047542877454901,
"grad_norm": 0.2946584460529412,
"learning_rate": 8.895626820457283e-05,
"loss": 0.557,
"mean_token_accuracy": 0.828194110840559,
"num_tokens": 326064722.0,
"step": 355
},
{
"epoch": 0.5118635030658492,
"grad_norm": 0.31227448766803945,
"learning_rate": 8.861421557505282e-05,
"loss": 0.5522,
"mean_token_accuracy": 0.8295037761330605,
"num_tokens": 330652094.0,
"step": 360
},
{
"epoch": 0.5189727183862082,
"grad_norm": 1.0759474066945163,
"learning_rate": 8.826771158634567e-05,
"loss": 0.5629,
"mean_token_accuracy": 0.8260238766670227,
"num_tokens": 335255835.0,
"step": 365
},
{
"epoch": 0.5260819337065671,
"grad_norm": 0.2758992633553522,
"learning_rate": 8.791680212420797e-05,
"loss": 0.5502,
"mean_token_accuracy": 0.828965923935175,
"num_tokens": 339843476.0,
"step": 370
},
{
"epoch": 0.5331911490269261,
"grad_norm": 0.29696149610793166,
"learning_rate": 8.756153365779066e-05,
"loss": 0.5542,
"mean_token_accuracy": 0.8278730027377605,
"num_tokens": 344420533.0,
"step": 375
},
{
"epoch": 0.5403003643472851,
"grad_norm": 0.284706804181623,
"learning_rate": 8.720195323348545e-05,
"loss": 0.559,
"mean_token_accuracy": 0.8278782211244107,
"num_tokens": 349010370.0,
"step": 380
},
{
"epoch": 0.5474095796676441,
"grad_norm": 0.3046957362601185,
"learning_rate": 8.68381084686946e-05,
"loss": 0.5576,
"mean_token_accuracy": 0.8258513130247593,
"num_tokens": 353598451.0,
"step": 385
},
{
"epoch": 0.5545187949880032,
"grad_norm": 0.3134773718519533,
"learning_rate": 8.647004754552526e-05,
"loss": 0.5612,
"mean_token_accuracy": 0.8255665130913258,
"num_tokens": 358195615.0,
"step": 390
},
{
"epoch": 0.5616280103083622,
"grad_norm": 0.33349640254961,
"learning_rate": 8.609781920440891e-05,
"loss": 0.552,
"mean_token_accuracy": 0.8278413727879524,
"num_tokens": 362764034.0,
"step": 395
},
{
"epoch": 0.5687372256287212,
"grad_norm": 0.32034152048464726,
"learning_rate": 8.5721472737647e-05,
"loss": 0.5534,
"mean_token_accuracy": 0.8273369200527668,
"num_tokens": 367350265.0,
"step": 400
},
{
"epoch": 0.5687372256287212,
"eval_loss": 0.5274047255516052,
"eval_mean_token_accuracy": 0.8264030280866121,
"eval_num_tokens": 367350265.0,
"eval_runtime": 146.0134,
"eval_samples_per_second": 24.916,
"eval_steps_per_second": 0.781,
"step": 400
},
{
"epoch": 0.5758464409490802,
"grad_norm": 0.29085093151843905,
"learning_rate": 8.534105798288331e-05,
"loss": 0.5506,
"mean_token_accuracy": 0.830031219124794,
"num_tokens": 371939618.0,
"step": 405
},
{
"epoch": 0.5829556562694392,
"grad_norm": 0.27710417408529203,
"learning_rate": 8.49566253165043e-05,
"loss": 0.5439,
"mean_token_accuracy": 0.8304261237382888,
"num_tokens": 376519800.0,
"step": 410
},
{
"epoch": 0.5900648715897983,
"grad_norm": 0.2611394917691902,
"learning_rate": 8.456822564696789e-05,
"loss": 0.5409,
"mean_token_accuracy": 0.832954341173172,
"num_tokens": 381102299.0,
"step": 415
},
{
"epoch": 0.5971740869101573,
"grad_norm": 0.42771473321829473,
"learning_rate": 8.417591040806213e-05,
"loss": 0.5504,
"mean_token_accuracy": 0.8300940133631229,
"num_tokens": 385700779.0,
"step": 420
},
{
"epoch": 0.6042833022305163,
"grad_norm": 0.28194050483515865,
"learning_rate": 8.377973155209387e-05,
"loss": 0.5553,
"mean_token_accuracy": 0.8270630918443203,
"num_tokens": 390294365.0,
"step": 425
},
{
"epoch": 0.6113925175508753,
"grad_norm": 0.27563889901609234,
"learning_rate": 8.337974154300913e-05,
"loss": 0.5427,
"mean_token_accuracy": 0.8309814311563969,
"num_tokens": 394889149.0,
"step": 430
},
{
"epoch": 0.6185017328712343,
"grad_norm": 0.27875362292884753,
"learning_rate": 8.297599334944542e-05,
"loss": 0.5561,
"mean_token_accuracy": 0.8275676898658275,
"num_tokens": 399459807.0,
"step": 435
},
{
"epoch": 0.6256109481915934,
"grad_norm": 0.7336148967265075,
"learning_rate": 8.256854043771754e-05,
"loss": 0.5507,
"mean_token_accuracy": 0.8285100273787975,
"num_tokens": 404034333.0,
"step": 440
},
{
"epoch": 0.6327201635119524,
"grad_norm": 0.3259646654441019,
"learning_rate": 8.215743676473719e-05,
"loss": 0.5503,
"mean_token_accuracy": 0.8290993146598339,
"num_tokens": 408627270.0,
"step": 445
},
{
"epoch": 0.6398293788323114,
"grad_norm": 0.3012299941832976,
"learning_rate": 8.174273677086779e-05,
"loss": 0.552,
"mean_token_accuracy": 0.8279682919383049,
"num_tokens": 413222911.0,
"step": 450
},
{
"epoch": 0.6469385941526704,
"grad_norm": 0.30771992691522176,
"learning_rate": 8.132449537271519e-05,
"loss": 0.552,
"mean_token_accuracy": 0.8296807646751404,
"num_tokens": 417806274.0,
"step": 455
},
{
"epoch": 0.6540478094730294,
"grad_norm": 0.2810763807856677,
"learning_rate": 8.090276795585531e-05,
"loss": 0.5414,
"mean_token_accuracy": 0.8314659893512726,
"num_tokens": 422401434.0,
"step": 460
},
{
"epoch": 0.6611570247933884,
"grad_norm": 0.2672336811508722,
"learning_rate": 8.047761036749985e-05,
"loss": 0.5564,
"mean_token_accuracy": 0.8265900291502476,
"num_tokens": 426986385.0,
"step": 465
},
{
"epoch": 0.6682662401137475,
"grad_norm": 0.25924906311163326,
"learning_rate": 8.004907890910055e-05,
"loss": 0.5452,
"mean_token_accuracy": 0.8297064855694771,
"num_tokens": 431585703.0,
"step": 470
},
{
"epoch": 0.6753754554341065,
"grad_norm": 0.2772688573388134,
"learning_rate": 7.961723032889358e-05,
"loss": 0.5292,
"mean_token_accuracy": 0.8346129797399044,
"num_tokens": 436150194.0,
"step": 475
},
{
"epoch": 0.6824846707544655,
"grad_norm": 0.25573353155086187,
"learning_rate": 7.918212181438467e-05,
"loss": 0.5397,
"mean_token_accuracy": 0.8314497999846935,
"num_tokens": 440736901.0,
"step": 480
},
{
"epoch": 0.6895938860748245,
"grad_norm": 0.2640386419783165,
"learning_rate": 7.874381098477599e-05,
"loss": 0.5359,
"mean_token_accuracy": 0.8328767582774163,
"num_tokens": 445334774.0,
"step": 485
},
{
"epoch": 0.6967031013951835,
"grad_norm": 0.2662269663206075,
"learning_rate": 7.830235588333597e-05,
"loss": 0.5578,
"mean_token_accuracy": 0.8268053226172924,
"num_tokens": 449908855.0,
"step": 490
},
{
"epoch": 0.7038123167155426,
"grad_norm": 0.2756351015892551,
"learning_rate": 7.785781496971297e-05,
"loss": 0.5503,
"mean_token_accuracy": 0.8284729138016701,
"num_tokens": 454513487.0,
"step": 495
},
{
"epoch": 0.7109215320359016,
"grad_norm": 0.4547105928976161,
"learning_rate": 7.741024711219366e-05,
"loss": 0.5431,
"mean_token_accuracy": 0.8298681430518627,
"num_tokens": 459106365.0,
"step": 500
},
{
"epoch": 0.7109215320359016,
"eval_loss": 0.5168540477752686,
"eval_mean_token_accuracy": 0.8290872861418808,
"eval_num_tokens": 459106365.0,
"eval_runtime": 146.2066,
"eval_samples_per_second": 24.883,
"eval_steps_per_second": 0.78,
"step": 500
},
{
"epoch": 0.7180307473562606,
"grad_norm": 1.6021704699780053,
"learning_rate": 7.695971157990754e-05,
"loss": 0.5646,
"mean_token_accuracy": 0.8263038910925389,
"num_tokens": 463703240.0,
"step": 505
},
{
"epoch": 0.7251399626766196,
"grad_norm": 4.625968090811763,
"learning_rate": 7.650626803497806e-05,
"loss": 0.5581,
"mean_token_accuracy": 0.8270722553133965,
"num_tokens": 468295660.0,
"step": 510
},
{
"epoch": 0.7322491779969785,
"grad_norm": 0.27503115183353516,
"learning_rate": 7.604997652462205e-05,
"loss": 0.5492,
"mean_token_accuracy": 0.8294327199459076,
"num_tokens": 472896751.0,
"step": 515
},
{
"epoch": 0.7393583933173375,
"grad_norm": 0.267416722991217,
"learning_rate": 7.55908974731978e-05,
"loss": 0.5418,
"mean_token_accuracy": 0.8326966613531113,
"num_tokens": 477480918.0,
"step": 520
},
{
"epoch": 0.7464676086376966,
"grad_norm": 0.25628361203172423,
"learning_rate": 7.512909167420347e-05,
"loss": 0.5404,
"mean_token_accuracy": 0.8324044570326805,
"num_tokens": 482064392.0,
"step": 525
},
{
"epoch": 0.7535768239580556,
"grad_norm": 0.24597696845219366,
"learning_rate": 7.466462028222654e-05,
"loss": 0.5353,
"mean_token_accuracy": 0.8331540204584599,
"num_tokens": 486649806.0,
"step": 530
},
{
"epoch": 0.7606860392784146,
"grad_norm": 0.2497969231256322,
"learning_rate": 7.419754480484536e-05,
"loss": 0.5378,
"mean_token_accuracy": 0.8323175966739654,
"num_tokens": 491217398.0,
"step": 535
},
{
"epoch": 0.7677952545987736,
"grad_norm": 0.27136426093422567,
"learning_rate": 7.3727927094484e-05,
"loss": 0.5303,
"mean_token_accuracy": 0.8346898458898068,
"num_tokens": 495798334.0,
"step": 540
},
{
"epoch": 0.7749044699191326,
"grad_norm": 0.263928683082665,
"learning_rate": 7.32558293402215e-05,
"loss": 0.5193,
"mean_token_accuracy": 0.8367893837392331,
"num_tokens": 500382331.0,
"step": 545
},
{
"epoch": 0.7820136852394917,
"grad_norm": 0.2697485453052082,
"learning_rate": 7.27813140595565e-05,
"loss": 0.5249,
"mean_token_accuracy": 0.836308328807354,
"num_tokens": 504972961.0,
"step": 550
},
{
"epoch": 0.7891229005598507,
"grad_norm": 0.47577994241811294,
"learning_rate": 7.23044440901283e-05,
"loss": 0.5386,
"mean_token_accuracy": 0.832004614919424,
"num_tokens": 509556175.0,
"step": 555
},
{
"epoch": 0.7962321158802097,
"grad_norm": 0.26812210950339255,
"learning_rate": 7.182528258139563e-05,
"loss": 0.5327,
"mean_token_accuracy": 0.8331871695816517,
"num_tokens": 514159170.0,
"step": 560
},
{
"epoch": 0.8033413312005687,
"grad_norm": 0.2590503131411491,
"learning_rate": 7.13438929862741e-05,
"loss": 0.5447,
"mean_token_accuracy": 0.8303000062704087,
"num_tokens": 518758083.0,
"step": 565
},
{
"epoch": 0.8104505465209277,
"grad_norm": 0.2700164600845211,
"learning_rate": 7.086033905273344e-05,
"loss": 0.5367,
"mean_token_accuracy": 0.8323484763503075,
"num_tokens": 523345629.0,
"step": 570
},
{
"epoch": 0.8175597618412868,
"grad_norm": 0.26967028018820877,
"learning_rate": 7.037468481535567e-05,
"loss": 0.5212,
"mean_token_accuracy": 0.8371426187455654,
"num_tokens": 527940592.0,
"step": 575
},
{
"epoch": 0.8246689771616458,
"grad_norm": 0.3154368910279167,
"learning_rate": 6.988699458685537e-05,
"loss": 0.5275,
"mean_token_accuracy": 0.8351783238351345,
"num_tokens": 532516910.0,
"step": 580
},
{
"epoch": 0.8317781924820048,
"grad_norm": 0.26226153440650185,
"learning_rate": 6.9397332949563e-05,
"loss": 0.5335,
"mean_token_accuracy": 0.8329351760447026,
"num_tokens": 537121758.0,
"step": 585
},
{
"epoch": 0.8388874078023638,
"grad_norm": 0.31223173870328286,
"learning_rate": 6.890576474687263e-05,
"loss": 0.5458,
"mean_token_accuracy": 0.829648780822754,
"num_tokens": 541734519.0,
"step": 590
},
{
"epoch": 0.8459966231227228,
"grad_norm": 0.2565970150956528,
"learning_rate": 6.841235507465515e-05,
"loss": 0.5415,
"mean_token_accuracy": 0.8324811846017838,
"num_tokens": 546326546.0,
"step": 595
},
{
"epoch": 0.8531058384430819,
"grad_norm": 0.29462309278409743,
"learning_rate": 6.791716927263778e-05,
"loss": 0.5354,
"mean_token_accuracy": 0.8325764186680317,
"num_tokens": 550923667.0,
"step": 600
},
{
"epoch": 0.8531058384430819,
"eval_loss": 0.5030205249786377,
"eval_mean_token_accuracy": 0.8328834866222582,
"eval_num_tokens": 550923667.0,
"eval_runtime": 145.5099,
"eval_samples_per_second": 25.002,
"eval_steps_per_second": 0.783,
"step": 600
},
{
"epoch": 0.8602150537634409,
"grad_norm": 0.2995740161508053,
"learning_rate": 6.742027291575156e-05,
"loss": 0.5351,
"mean_token_accuracy": 0.8337548352777958,
"num_tokens": 555521300.0,
"step": 605
},
{
"epoch": 0.8673242690837999,
"grad_norm": 0.256895454866442,
"learning_rate": 6.692173180544768e-05,
"loss": 0.527,
"mean_token_accuracy": 0.8346491247415543,
"num_tokens": 560114622.0,
"step": 610
},
{
"epoch": 0.8744334844041589,
"grad_norm": 0.26124663621839667,
"learning_rate": 6.642161196098351e-05,
"loss": 0.5299,
"mean_token_accuracy": 0.835064522176981,
"num_tokens": 564707120.0,
"step": 615
},
{
"epoch": 0.8815426997245179,
"grad_norm": 0.30629789668279445,
"learning_rate": 6.591997961068024e-05,
"loss": 0.5391,
"mean_token_accuracy": 0.8325687229633332,
"num_tokens": 569285949.0,
"step": 620
},
{
"epoch": 0.888651915044877,
"grad_norm": 0.2517010032545197,
"learning_rate": 6.541690118315245e-05,
"loss": 0.528,
"mean_token_accuracy": 0.834906804561615,
"num_tokens": 573871769.0,
"step": 625
},
{
"epoch": 0.895761130365236,
"grad_norm": 0.3714356282666368,
"learning_rate": 6.491244329851133e-05,
"loss": 0.521,
"mean_token_accuracy": 0.8374850310385227,
"num_tokens": 578461250.0,
"step": 630
},
{
"epoch": 0.902870345685595,
"grad_norm": 0.2513550517622928,
"learning_rate": 6.440667275954262e-05,
"loss": 0.5151,
"mean_token_accuracy": 0.8384780243039132,
"num_tokens": 583046607.0,
"step": 635
},
{
"epoch": 0.909979561005954,
"grad_norm": 0.2790784344252937,
"learning_rate": 6.389965654286011e-05,
"loss": 0.5287,
"mean_token_accuracy": 0.8349935576319695,
"num_tokens": 587648232.0,
"step": 640
},
{
"epoch": 0.917088776326313,
"grad_norm": 0.27767689120972117,
"learning_rate": 6.339146179003636e-05,
"loss": 0.5207,
"mean_token_accuracy": 0.837136809527874,
"num_tokens": 592239729.0,
"step": 645
},
{
"epoch": 0.924197991646672,
"grad_norm": 0.2805149976836277,
"learning_rate": 6.288215579871148e-05,
"loss": 0.5229,
"mean_token_accuracy": 0.8374404884874821,
"num_tokens": 596831306.0,
"step": 650
},
{
"epoch": 0.9313072069670311,
"grad_norm": 0.24703194529226574,
"learning_rate": 6.23718060136812e-05,
"loss": 0.5152,
"mean_token_accuracy": 0.8385937295854091,
"num_tokens": 601427733.0,
"step": 655
},
{
"epoch": 0.9384164222873901,
"grad_norm": 0.33949011504626453,
"learning_rate": 6.186048001796556e-05,
"loss": 0.5204,
"mean_token_accuracy": 0.8384438544511795,
"num_tokens": 606006466.0,
"step": 660
},
{
"epoch": 0.945525637607749,
"grad_norm": 0.24749318396547174,
"learning_rate": 6.134824552385915e-05,
"loss": 0.5256,
"mean_token_accuracy": 0.8357278972864151,
"num_tokens": 610597552.0,
"step": 665
},
{
"epoch": 0.952634852928108,
"grad_norm": 0.26267746218214755,
"learning_rate": 6.0835170363964434e-05,
"loss": 0.528,
"mean_token_accuracy": 0.8351906433701515,
"num_tokens": 615193994.0,
"step": 670
},
{
"epoch": 0.959744068248467,
"grad_norm": 0.25519090759528035,
"learning_rate": 6.032132248220893e-05,
"loss": 0.518,
"mean_token_accuracy": 0.8378535941243171,
"num_tokens": 619786315.0,
"step": 675
},
{
"epoch": 0.966853283568826,
"grad_norm": 0.25149430173186577,
"learning_rate": 5.9806769924847784e-05,
"loss": 0.5175,
"mean_token_accuracy": 0.8372136250138282,
"num_tokens": 624383919.0,
"step": 680
},
{
"epoch": 0.9739624988891851,
"grad_norm": 0.2669872598294479,
"learning_rate": 5.929158083145271e-05,
"loss": 0.5166,
"mean_token_accuracy": 0.8380297608673573,
"num_tokens": 628976906.0,
"step": 685
},
{
"epoch": 0.9810717142095441,
"grad_norm": 0.3079990980800955,
"learning_rate": 5.8775823425888664e-05,
"loss": 0.5171,
"mean_token_accuracy": 0.8365243822336197,
"num_tokens": 633557562.0,
"step": 690
},
{
"epoch": 0.9881809295299031,
"grad_norm": 0.26934237379344833,
"learning_rate": 5.825956600727932e-05,
"loss": 0.5176,
"mean_token_accuracy": 0.8371751248836518,
"num_tokens": 638143938.0,
"step": 695
},
{
"epoch": 0.9952901448502621,
"grad_norm": 0.24892879578477203,
"learning_rate": 5.774287694096246e-05,
"loss": 0.5203,
"mean_token_accuracy": 0.8368992209434509,
"num_tokens": 642760408.0,
"step": 700
},
{
"epoch": 0.9952901448502621,
"eval_loss": 0.49169814586639404,
"eval_mean_token_accuracy": 0.8366760449451313,
"eval_num_tokens": 642760408.0,
"eval_runtime": 148.141,
"eval_samples_per_second": 24.558,
"eval_steps_per_second": 0.77,
"step": 700
},
{
"epoch": 1.0014218430640718,
"grad_norm": 0.5358904769553885,
"learning_rate": 5.72258246494368e-05,
"loss": 0.4893,
"mean_token_accuracy": 0.8436046752376832,
"num_tokens": 646718128.0,
"step": 705
},
{
"epoch": 1.008531058384431,
"grad_norm": 0.25743890956382126,
"learning_rate": 5.6708477603301146e-05,
"loss": 0.461,
"mean_token_accuracy": 0.8506338618695736,
"num_tokens": 651304404.0,
"step": 710
},
{
"epoch": 1.0156402737047898,
"grad_norm": 0.2648866270558085,
"learning_rate": 5.6190904312187154e-05,
"loss": 0.4544,
"mean_token_accuracy": 0.8519260853528976,
"num_tokens": 655879909.0,
"step": 715
},
{
"epoch": 1.022749489025149,
"grad_norm": 0.27694330822934976,
"learning_rate": 5.567317331568687e-05,
"loss": 0.4474,
"mean_token_accuracy": 0.8545098066329956,
"num_tokens": 660449626.0,
"step": 720
},
{
"epoch": 1.0298587043455079,
"grad_norm": 0.24825528169946715,
"learning_rate": 5.515535317427657e-05,
"loss": 0.4517,
"mean_token_accuracy": 0.8533940657973289,
"num_tokens": 665058163.0,
"step": 725
},
{
"epoch": 1.0369679196658668,
"grad_norm": 0.24464581183689546,
"learning_rate": 5.463751246023746e-05,
"loss": 0.4559,
"mean_token_accuracy": 0.8523735709488391,
"num_tokens": 669654595.0,
"step": 730
},
{
"epoch": 1.044077134986226,
"grad_norm": 0.24930171479148333,
"learning_rate": 5.4119719748575106e-05,
"loss": 0.4487,
"mean_token_accuracy": 0.8542089037597179,
"num_tokens": 674232882.0,
"step": 735
},
{
"epoch": 1.0511863503065848,
"grad_norm": 0.23303088594874635,
"learning_rate": 5.360204360793836e-05,
"loss": 0.4436,
"mean_token_accuracy": 0.8547257304191589,
"num_tokens": 678813498.0,
"step": 740
},
{
"epoch": 1.058295565626944,
"grad_norm": 0.317097982341769,
"learning_rate": 5.308455259153915e-05,
"loss": 0.458,
"mean_token_accuracy": 0.8515614397823811,
"num_tokens": 683401148.0,
"step": 745
},
{
"epoch": 1.0654047809473028,
"grad_norm": 0.24160258781744343,
"learning_rate": 5.256731522807436e-05,
"loss": 0.4506,
"mean_token_accuracy": 0.8526393964886665,
"num_tokens": 687982154.0,
"step": 750
},
{
"epoch": 1.072513996267662,
"grad_norm": 0.23602108922437653,
"learning_rate": 5.205040001265094e-05,
"loss": 0.4515,
"mean_token_accuracy": 0.8521531477570534,
"num_tokens": 692583016.0,
"step": 755
},
{
"epoch": 1.0796232115880209,
"grad_norm": 0.2431546567595459,
"learning_rate": 5.1533875397715345e-05,
"loss": 0.455,
"mean_token_accuracy": 0.8529531605541706,
"num_tokens": 697183950.0,
"step": 760
},
{
"epoch": 1.08673242690838,
"grad_norm": 0.27597324346348756,
"learning_rate": 5.101780978398888e-05,
"loss": 0.4518,
"mean_token_accuracy": 0.8528432317078114,
"num_tokens": 701785548.0,
"step": 765
},
{
"epoch": 1.093841642228739,
"grad_norm": 0.26932926236063864,
"learning_rate": 5.050227151140958e-05,
"loss": 0.4536,
"mean_token_accuracy": 0.852679468691349,
"num_tokens": 706364188.0,
"step": 770
},
{
"epoch": 1.100950857549098,
"grad_norm": 0.2587220894683173,
"learning_rate": 4.998732885008244e-05,
"loss": 0.4503,
"mean_token_accuracy": 0.8526183031499386,
"num_tokens": 710949271.0,
"step": 775
},
{
"epoch": 1.108060072869457,
"grad_norm": 0.24430696998738718,
"learning_rate": 4.947304999123867e-05,
"loss": 0.4357,
"mean_token_accuracy": 0.8572968378663063,
"num_tokens": 715539336.0,
"step": 780
},
{
"epoch": 1.115169288189816,
"grad_norm": 0.24614402366250857,
"learning_rate": 4.895950303820552e-05,
"loss": 0.4525,
"mean_token_accuracy": 0.8526603005826473,
"num_tokens": 720147357.0,
"step": 785
},
{
"epoch": 1.122278503510175,
"grad_norm": 0.23262198319374294,
"learning_rate": 4.844675599738765e-05,
"loss": 0.4523,
"mean_token_accuracy": 0.852922348678112,
"num_tokens": 724741149.0,
"step": 790
},
{
"epoch": 1.1293877188305341,
"grad_norm": 0.2551816873924689,
"learning_rate": 4.793487676926142e-05,
"loss": 0.4562,
"mean_token_accuracy": 0.8518377915024757,
"num_tokens": 729327424.0,
"step": 795
},
{
"epoch": 1.136496934150893,
"grad_norm": 0.23754167080648592,
"learning_rate": 4.742393313938327e-05,
"loss": 0.445,
"mean_token_accuracy": 0.8547273397445678,
"num_tokens": 733921218.0,
"step": 800
},
{
"epoch": 1.136496934150893,
"eval_loss": 0.4879998564720154,
"eval_mean_token_accuracy": 0.8380277005203983,
"eval_num_tokens": 733921218.0,
"eval_runtime": 146.7948,
"eval_samples_per_second": 24.783,
"eval_steps_per_second": 0.777,
"step": 800
},
{
"epoch": 1.1436061494712522,
"grad_norm": 0.25050469601877845,
"learning_rate": 4.6913992769413026e-05,
"loss": 0.4552,
"mean_token_accuracy": 0.8521495588123799,
"num_tokens": 738503816.0,
"step": 805
},
{
"epoch": 1.150715364791611,
"grad_norm": 0.24476661787598053,
"learning_rate": 4.6405123188153966e-05,
"loss": 0.4506,
"mean_token_accuracy": 0.8532384999096394,
"num_tokens": 743095770.0,
"step": 810
},
{
"epoch": 1.1578245801119702,
"grad_norm": 0.24115136773182058,
"learning_rate": 4.589739178261028e-05,
"loss": 0.4471,
"mean_token_accuracy": 0.8549422182142734,
"num_tokens": 747676184.0,
"step": 815
},
{
"epoch": 1.1649337954323291,
"grad_norm": 0.24283949811905522,
"learning_rate": 4.5390865789063344e-05,
"loss": 0.448,
"mean_token_accuracy": 0.8543575026094914,
"num_tokens": 752274534.0,
"step": 820
},
{
"epoch": 1.1720430107526882,
"grad_norm": 0.2701107129425895,
"learning_rate": 4.4885612284167955e-05,
"loss": 0.4411,
"mean_token_accuracy": 0.8565104402601719,
"num_tokens": 756863683.0,
"step": 825
},
{
"epoch": 1.1791522260730471,
"grad_norm": 0.2886054721404824,
"learning_rate": 4.4381698176069754e-05,
"loss": 0.4379,
"mean_token_accuracy": 0.8567862503230572,
"num_tokens": 761453110.0,
"step": 830
},
{
"epoch": 1.1862614413934063,
"grad_norm": 0.2561982737144238,
"learning_rate": 4.387919019554487e-05,
"loss": 0.4532,
"mean_token_accuracy": 0.8531202852725983,
"num_tokens": 766041248.0,
"step": 835
},
{
"epoch": 1.1933706567137652,
"grad_norm": 0.26412588441218454,
"learning_rate": 4.3378154887163144e-05,
"loss": 0.4453,
"mean_token_accuracy": 0.853339533507824,
"num_tokens": 770624920.0,
"step": 840
},
{
"epoch": 1.2004798720341243,
"grad_norm": 0.25032821222177587,
"learning_rate": 4.287865860047596e-05,
"loss": 0.4558,
"mean_token_accuracy": 0.8522251404821872,
"num_tokens": 775225729.0,
"step": 845
},
{
"epoch": 1.2075890873544832,
"grad_norm": 0.23998083533004458,
"learning_rate": 4.2380767481229886e-05,
"loss": 0.4418,
"mean_token_accuracy": 0.8569207176566124,
"num_tokens": 779811918.0,
"step": 850
},
{
"epoch": 1.2146983026748424,
"grad_norm": 0.2456015755421057,
"learning_rate": 4.1884547462607326e-05,
"loss": 0.4454,
"mean_token_accuracy": 0.8553664483129978,
"num_tokens": 784391305.0,
"step": 855
},
{
"epoch": 1.2218075179952013,
"grad_norm": 0.25612737416807746,
"learning_rate": 4.139006425649541e-05,
"loss": 0.4504,
"mean_token_accuracy": 0.8527485050261021,
"num_tokens": 788981682.0,
"step": 860
},
{
"epoch": 1.2289167333155602,
"grad_norm": 0.24215144672428524,
"learning_rate": 4.089738334478399e-05,
"loss": 0.4466,
"mean_token_accuracy": 0.8540120802819728,
"num_tokens": 793548878.0,
"step": 865
},
{
"epoch": 1.2360259486359193,
"grad_norm": 0.251956160570565,
"learning_rate": 4.0406569970694285e-05,
"loss": 0.4514,
"mean_token_accuracy": 0.8536942526698112,
"num_tokens": 798145090.0,
"step": 870
},
{
"epoch": 1.2431351639562784,
"grad_norm": 0.24137828427946414,
"learning_rate": 3.991768913013904e-05,
"loss": 0.4408,
"mean_token_accuracy": 0.8566184468567372,
"num_tokens": 802721141.0,
"step": 875
},
{
"epoch": 1.2502443792766373,
"grad_norm": 0.3769699788745637,
"learning_rate": 3.943080556311536e-05,
"loss": 0.438,
"mean_token_accuracy": 0.8581221453845501,
"num_tokens": 807303824.0,
"step": 880
},
{
"epoch": 1.2573535945969962,
"grad_norm": 0.251278759950789,
"learning_rate": 3.894598374513174e-05,
"loss": 0.4485,
"mean_token_accuracy": 0.8541063219308853,
"num_tokens": 811911762.0,
"step": 885
},
{
"epoch": 1.2644628099173554,
"grad_norm": 0.24068163801342848,
"learning_rate": 3.846328787866964e-05,
"loss": 0.4339,
"mean_token_accuracy": 0.859130322188139,
"num_tokens": 816508640.0,
"step": 890
},
{
"epoch": 1.2715720252377145,
"grad_norm": 0.23232711368022352,
"learning_rate": 3.798278188468164e-05,
"loss": 0.4445,
"mean_token_accuracy": 0.8543654963374138,
"num_tokens": 821100737.0,
"step": 895
},
{
"epoch": 1.2786812405580734,
"grad_norm": 0.2368572559014999,
"learning_rate": 3.750452939412667e-05,
"loss": 0.4434,
"mean_token_accuracy": 0.8547687388956546,
"num_tokens": 825694727.0,
"step": 900
},
{
"epoch": 1.2786812405580734,
"eval_loss": 0.4800785183906555,
"eval_mean_token_accuracy": 0.8407511988229919,
"eval_num_tokens": 825694727.0,
"eval_runtime": 146.4602,
"eval_samples_per_second": 24.84,
"eval_steps_per_second": 0.778,
"step": 900
},
{
"epoch": 1.2857904558784323,
"grad_norm": 0.26166517034573067,
"learning_rate": 3.7028593739543715e-05,
"loss": 0.4475,
"mean_token_accuracy": 0.854764747619629,
"num_tokens": 830291180.0,
"step": 905
},
{
"epoch": 1.2928996711987915,
"grad_norm": 0.24015937616460478,
"learning_rate": 3.6555037946664926e-05,
"loss": 0.4455,
"mean_token_accuracy": 0.8552566647529602,
"num_tokens": 834892125.0,
"step": 910
},
{
"epoch": 1.3000088865191506,
"grad_norm": 0.252313420976958,
"learning_rate": 3.608392472606956e-05,
"loss": 0.4441,
"mean_token_accuracy": 0.8559129044413567,
"num_tokens": 839486375.0,
"step": 915
},
{
"epoch": 1.3071181018395095,
"grad_norm": 0.256487918121681,
"learning_rate": 3.5615316464879445e-05,
"loss": 0.4401,
"mean_token_accuracy": 0.8565216913819313,
"num_tokens": 844107444.0,
"step": 920
},
{
"epoch": 1.3142273171598684,
"grad_norm": 0.23448215102314007,
"learning_rate": 3.5149275218497445e-05,
"loss": 0.4383,
"mean_token_accuracy": 0.8571599997580052,
"num_tokens": 848704492.0,
"step": 925
},
{
"epoch": 1.3213365324802275,
"grad_norm": 0.24419792529251788,
"learning_rate": 3.4685862702389714e-05,
"loss": 0.4429,
"mean_token_accuracy": 0.855844734609127,
"num_tokens": 853292585.0,
"step": 930
},
{
"epoch": 1.3284457478005864,
"grad_norm": 0.23566825561303636,
"learning_rate": 3.422514028391304e-05,
"loss": 0.4354,
"mean_token_accuracy": 0.8570930063724518,
"num_tokens": 857867604.0,
"step": 935
},
{
"epoch": 1.3355549631209456,
"grad_norm": 0.2454162982602229,
"learning_rate": 3.376716897418831e-05,
"loss": 0.4447,
"mean_token_accuracy": 0.8552064374089241,
"num_tokens": 862460961.0,
"step": 940
},
{
"epoch": 1.3426641784413045,
"grad_norm": 0.2524163496767361,
"learning_rate": 3.331200942002113e-05,
"loss": 0.4525,
"mean_token_accuracy": 0.8537895001471043,
"num_tokens": 867058298.0,
"step": 945
},
{
"epoch": 1.3497733937616636,
"grad_norm": 0.23190520165291026,
"learning_rate": 3.2859721895870635e-05,
"loss": 0.44,
"mean_token_accuracy": 0.8565752863883972,
"num_tokens": 871661806.0,
"step": 950
},
{
"epoch": 1.3568826090820225,
"grad_norm": 0.24782970977401894,
"learning_rate": 3.2410366295867664e-05,
"loss": 0.4352,
"mean_token_accuracy": 0.8579383887350559,
"num_tokens": 876250262.0,
"step": 955
},
{
"epoch": 1.3639918244023816,
"grad_norm": 0.22786025696468146,
"learning_rate": 3.19640021258833e-05,
"loss": 0.444,
"mean_token_accuracy": 0.8550498209893703,
"num_tokens": 880839029.0,
"step": 960
},
{
"epoch": 1.3711010397227406,
"grad_norm": 0.2265711418699179,
"learning_rate": 3.152068849564879e-05,
"loss": 0.4435,
"mean_token_accuracy": 0.8563594095408916,
"num_tokens": 885417939.0,
"step": 965
},
{
"epoch": 1.3782102550430997,
"grad_norm": 0.23977507514952898,
"learning_rate": 3.1080484110927954e-05,
"loss": 0.4325,
"mean_token_accuracy": 0.8590381443500519,
"num_tokens": 890005207.0,
"step": 970
},
{
"epoch": 1.3853194703634586,
"grad_norm": 0.24689756755824815,
"learning_rate": 3.0643447265743096e-05,
"loss": 0.44,
"mean_token_accuracy": 0.85642144754529,
"num_tokens": 894591297.0,
"step": 975
},
{
"epoch": 1.3924286856838177,
"grad_norm": 0.24051873631020942,
"learning_rate": 3.0209635834655392e-05,
"loss": 0.435,
"mean_token_accuracy": 0.8576522074639797,
"num_tokens": 899178832.0,
"step": 980
},
{
"epoch": 1.3995379010041766,
"grad_norm": 0.2413492029135495,
"learning_rate": 2.9779107265100892e-05,
"loss": 0.4369,
"mean_token_accuracy": 0.857710150629282,
"num_tokens": 903773147.0,
"step": 985
},
{
"epoch": 1.4066471163245358,
"grad_norm": 0.23506138046697497,
"learning_rate": 2.9351918569783006e-05,
"loss": 0.4364,
"mean_token_accuracy": 0.8576699584722519,
"num_tokens": 908371284.0,
"step": 990
},
{
"epoch": 1.4137563316448947,
"grad_norm": 0.25438867805085685,
"learning_rate": 2.892812631912265e-05,
"loss": 0.4349,
"mean_token_accuracy": 0.8586409255862236,
"num_tokens": 912978481.0,
"step": 995
},
{
"epoch": 1.4208655469652536,
"grad_norm": 0.24429497699288996,
"learning_rate": 2.8507786633766877e-05,
"loss": 0.4354,
"mean_token_accuracy": 0.8573046490550041,
"num_tokens": 917574029.0,
"step": 1000
},
{
"epoch": 1.4208655469652536,
"eval_loss": 0.47304314374923706,
"eval_mean_token_accuracy": 0.842672534156264,
"eval_num_tokens": 917574029.0,
"eval_runtime": 145.3562,
"eval_samples_per_second": 25.028,
"eval_steps_per_second": 0.784,
"step": 1000
},
{
"epoch": 1.4279747622856127,
"grad_norm": 0.24463063083449332,
"learning_rate": 2.809095517715713e-05,
"loss": 0.4303,
"mean_token_accuracy": 0.858917984366417,
"num_tokens": 922160147.0,
"step": 1005
},
{
"epoch": 1.4350839776059718,
"grad_norm": 0.24348846567727375,
"learning_rate": 2.7677687148157998e-05,
"loss": 0.4367,
"mean_token_accuracy": 0.8577364660799504,
"num_tokens": 926746028.0,
"step": 1010
},
{
"epoch": 1.4421931929263307,
"grad_norm": 0.24745049020205356,
"learning_rate": 2.7268037273747525e-05,
"loss": 0.4368,
"mean_token_accuracy": 0.857840034365654,
"num_tokens": 931337261.0,
"step": 1015
},
{
"epoch": 1.4493024082466897,
"grad_norm": 0.2439587698234042,
"learning_rate": 2.686205980176998e-05,
"loss": 0.4447,
"mean_token_accuracy": 0.8548872321844101,
"num_tokens": 935941769.0,
"step": 1020
},
{
"epoch": 1.4564116235670488,
"grad_norm": 0.25142114078442956,
"learning_rate": 2.6459808493752102e-05,
"loss": 0.4284,
"mean_token_accuracy": 0.8603815868496895,
"num_tokens": 940535643.0,
"step": 1025
},
{
"epoch": 1.463520838887408,
"grad_norm": 0.2444154895688051,
"learning_rate": 2.606133661778377e-05,
"loss": 0.4368,
"mean_token_accuracy": 0.8575351513922215,
"num_tokens": 945124519.0,
"step": 1030
},
{
"epoch": 1.4706300542077668,
"grad_norm": 0.2397327728518288,
"learning_rate": 2.5666696941463885e-05,
"loss": 0.4307,
"mean_token_accuracy": 0.8594269149005413,
"num_tokens": 949709974.0,
"step": 1035
},
{
"epoch": 1.4777392695281257,
"grad_norm": 0.3077470484547689,
"learning_rate": 2.5275941724912743e-05,
"loss": 0.4288,
"mean_token_accuracy": 0.8588724002242089,
"num_tokens": 954294899.0,
"step": 1040
},
{
"epoch": 1.4848484848484849,
"grad_norm": 0.24584716924955974,
"learning_rate": 2.4889122713851394e-05,
"loss": 0.4304,
"mean_token_accuracy": 0.8590269833803177,
"num_tokens": 958889833.0,
"step": 1045
},
{
"epoch": 1.491957700168844,
"grad_norm": 0.24260820183680837,
"learning_rate": 2.4506291132749272e-05,
"loss": 0.4322,
"mean_token_accuracy": 0.8588926158845425,
"num_tokens": 963479630.0,
"step": 1050
},
{
"epoch": 1.499066915489203,
"grad_norm": 0.2512439219193439,
"learning_rate": 2.4127497678040846e-05,
"loss": 0.4338,
"mean_token_accuracy": 0.8590321697294712,
"num_tokens": 968086693.0,
"step": 1055
},
{
"epoch": 1.5061761308095618,
"grad_norm": 0.25788120133019554,
"learning_rate": 2.375279251141201e-05,
"loss": 0.4302,
"mean_token_accuracy": 0.8599278099834919,
"num_tokens": 972668807.0,
"step": 1060
},
{
"epoch": 1.513285346129921,
"grad_norm": 0.24857387974370135,
"learning_rate": 2.338222525315758e-05,
"loss": 0.4371,
"mean_token_accuracy": 0.8579599760472775,
"num_tokens": 977267842.0,
"step": 1065
},
{
"epoch": 1.52039456145028,
"grad_norm": 0.24022880991860499,
"learning_rate": 2.301584497561024e-05,
"loss": 0.4234,
"mean_token_accuracy": 0.862085721641779,
"num_tokens": 981857003.0,
"step": 1070
},
{
"epoch": 1.527503776770639,
"grad_norm": 0.27120541109477303,
"learning_rate": 2.2653700196642134e-05,
"loss": 0.4396,
"mean_token_accuracy": 0.857264555990696,
"num_tokens": 986456929.0,
"step": 1075
},
{
"epoch": 1.5346129920909979,
"grad_norm": 0.24114703590240177,
"learning_rate": 2.2295838873239965e-05,
"loss": 0.4296,
"mean_token_accuracy": 0.8604548752307892,
"num_tokens": 991061372.0,
"step": 1080
},
{
"epoch": 1.541722207411357,
"grad_norm": 0.23963844839444817,
"learning_rate": 2.194230839515425e-05,
"loss": 0.4336,
"mean_token_accuracy": 0.8584208697080612,
"num_tokens": 995660319.0,
"step": 1085
},
{
"epoch": 1.5488314227317161,
"grad_norm": 0.24314988814533856,
"learning_rate": 2.1593155578623702e-05,
"loss": 0.4306,
"mean_token_accuracy": 0.8601135425269604,
"num_tokens": 1000236933.0,
"step": 1090
},
{
"epoch": 1.555940638052075,
"grad_norm": 0.2566886574453899,
"learning_rate": 2.1248426660175713e-05,
"loss": 0.4384,
"mean_token_accuracy": 0.8573588460683823,
"num_tokens": 1004820862.0,
"step": 1095
},
{
"epoch": 1.563049853372434,
"grad_norm": 0.2621075128506793,
"learning_rate": 2.0908167290503326e-05,
"loss": 0.4298,
"mean_token_accuracy": 0.8607131637632847,
"num_tokens": 1009411521.0,
"step": 1100
},
{
"epoch": 1.563049853372434,
"eval_loss": 0.4672245681285858,
"eval_mean_token_accuracy": 0.844007690747579,
"eval_num_tokens": 1009411521.0,
"eval_runtime": 146.3617,
"eval_samples_per_second": 24.856,
"eval_steps_per_second": 0.779,
"step": 1100
},
{
"epoch": 1.570159068692793,
"grad_norm": 0.23570827346042514,
"learning_rate": 2.0572422528420095e-05,
"loss": 0.4206,
"mean_token_accuracy": 0.8622309692203999,
"num_tokens": 1013995376.0,
"step": 1105
},
{
"epoch": 1.577268284013152,
"grad_norm": 0.28786088105829327,
"learning_rate": 2.024123683489303e-05,
"loss": 0.4195,
"mean_token_accuracy": 0.8634026922285557,
"num_tokens": 1018562407.0,
"step": 1110
},
{
"epoch": 1.584377499333511,
"grad_norm": 0.22477409346403396,
"learning_rate": 1.9914654067154996e-05,
"loss": 0.4345,
"mean_token_accuracy": 0.8584335811436177,
"num_tokens": 1023168118.0,
"step": 1115
},
{
"epoch": 1.59148671465387,
"grad_norm": 0.24599345473106599,
"learning_rate": 1.959271747289686e-05,
"loss": 0.4278,
"mean_token_accuracy": 0.8616135574877262,
"num_tokens": 1027754848.0,
"step": 1120
},
{
"epoch": 1.5985959299742292,
"grad_norm": 0.24491593894054278,
"learning_rate": 1.9275469684540404e-05,
"loss": 0.4294,
"mean_token_accuracy": 0.8590353332459927,
"num_tokens": 1032347251.0,
"step": 1125
},
{
"epoch": 1.605705145294588,
"grad_norm": 0.2540751338276317,
"learning_rate": 1.8962952713592752e-05,
"loss": 0.4242,
"mean_token_accuracy": 0.8608104437589645,
"num_tokens": 1036931829.0,
"step": 1130
},
{
"epoch": 1.612814360614947,
"grad_norm": 0.2510287685288083,
"learning_rate": 1.8655207945083e-05,
"loss": 0.4239,
"mean_token_accuracy": 0.8617179103195667,
"num_tokens": 1041532224.0,
"step": 1135
},
{
"epoch": 1.6199235759353061,
"grad_norm": 0.2693350827409704,
"learning_rate": 1.8352276132081847e-05,
"loss": 0.4357,
"mean_token_accuracy": 0.8589904353022575,
"num_tokens": 1046120676.0,
"step": 1140
},
{
"epoch": 1.6270327912556652,
"grad_norm": 0.24443054034299724,
"learning_rate": 1.8054197390304755e-05,
"loss": 0.4275,
"mean_token_accuracy": 0.8615889854729175,
"num_tokens": 1050708153.0,
"step": 1145
},
{
"epoch": 1.6341420065760242,
"grad_norm": 0.24588007040764026,
"learning_rate": 1.7761011192799764e-05,
"loss": 0.4238,
"mean_token_accuracy": 0.8622479006648064,
"num_tokens": 1055294826.0,
"step": 1150
},
{
"epoch": 1.641251221896383,
"grad_norm": 0.24561473837992528,
"learning_rate": 1.7472756364720206e-05,
"loss": 0.4243,
"mean_token_accuracy": 0.8616314500570297,
"num_tokens": 1059896792.0,
"step": 1155
},
{
"epoch": 1.6483604372167422,
"grad_norm": 0.23202476301237993,
"learning_rate": 1.7189471078183302e-05,
"loss": 0.4313,
"mean_token_accuracy": 0.860023857653141,
"num_tokens": 1064504870.0,
"step": 1160
},
{
"epoch": 1.6554696525371013,
"grad_norm": 0.2403111932989795,
"learning_rate": 1.6911192847215225e-05,
"loss": 0.4315,
"mean_token_accuracy": 0.85991101115942,
"num_tokens": 1069092813.0,
"step": 1165
},
{
"epoch": 1.6625788678574602,
"grad_norm": 0.23285052418281263,
"learning_rate": 1.6637958522783298e-05,
"loss": 0.4286,
"mean_token_accuracy": 0.8603983536362648,
"num_tokens": 1073673087.0,
"step": 1170
},
{
"epoch": 1.6696880831778191,
"grad_norm": 0.23644436345090544,
"learning_rate": 1.6369804287916028e-05,
"loss": 0.4237,
"mean_token_accuracy": 0.8625174552202225,
"num_tokens": 1078263989.0,
"step": 1175
},
{
"epoch": 1.6767972984981783,
"grad_norm": 0.2283809036559784,
"learning_rate": 1.6106765652911563e-05,
"loss": 0.4196,
"mean_token_accuracy": 0.8629219397902489,
"num_tokens": 1082858600.0,
"step": 1180
},
{
"epoch": 1.6839065138185374,
"grad_norm": 0.2437421457507895,
"learning_rate": 1.5848877450635237e-05,
"loss": 0.431,
"mean_token_accuracy": 0.8596989519894123,
"num_tokens": 1087463215.0,
"step": 1185
},
{
"epoch": 1.6910157291388963,
"grad_norm": 0.24997191755310427,
"learning_rate": 1.559617383190684e-05,
"loss": 0.4258,
"mean_token_accuracy": 0.8600839108228684,
"num_tokens": 1092046691.0,
"step": 1190
},
{
"epoch": 1.6981249444592552,
"grad_norm": 0.24275510902589129,
"learning_rate": 1.5348688260978188e-05,
"loss": 0.4198,
"mean_token_accuracy": 0.8634254619479179,
"num_tokens": 1096635412.0,
"step": 1195
},
{
"epoch": 1.7052341597796143,
"grad_norm": 0.25771028141912433,
"learning_rate": 1.5106453511101657e-05,
"loss": 0.4198,
"mean_token_accuracy": 0.8630197443068027,
"num_tokens": 1101239957.0,
"step": 1200
},
{
"epoch": 1.7052341597796143,
"eval_loss": 0.4617161452770233,
"eval_mean_token_accuracy": 0.8460459296117749,
"eval_num_tokens": 1101239957.0,
"eval_runtime": 143.0225,
"eval_samples_per_second": 25.437,
"eval_steps_per_second": 0.797,
"step": 1200
},
{
"epoch": 1.7123433750999735,
"grad_norm": 0.2465846462175401,
"learning_rate": 1.4869501660190118e-05,
"loss": 0.4269,
"mean_token_accuracy": 0.8613091327250004,
"num_tokens": 1105835727.0,
"step": 1205
},
{
"epoch": 1.7194525904203324,
"grad_norm": 0.24343231445496366,
"learning_rate": 1.4637864086569114e-05,
"loss": 0.4189,
"mean_token_accuracy": 0.8625466778874398,
"num_tokens": 1110431832.0,
"step": 1210
},
{
"epoch": 1.7265618057406913,
"grad_norm": 0.24500024608031826,
"learning_rate": 1.4411571464821522e-05,
"loss": 0.4178,
"mean_token_accuracy": 0.8632443450391293,
"num_tokens": 1115003545.0,
"step": 1215
},
{
"epoch": 1.7336710210610504,
"grad_norm": 0.24384954499049283,
"learning_rate": 1.4190653761725458e-05,
"loss": 0.4331,
"mean_token_accuracy": 0.8595723591744899,
"num_tokens": 1119594038.0,
"step": 1220
},
{
"epoch": 1.7407802363814096,
"grad_norm": 0.24988962843301607,
"learning_rate": 1.3975140232286033e-05,
"loss": 0.4292,
"mean_token_accuracy": 0.8610283821821213,
"num_tokens": 1124191272.0,
"step": 1225
},
{
"epoch": 1.7478894517017685,
"grad_norm": 0.23666630913921613,
"learning_rate": 1.3765059415861142e-05,
"loss": 0.4256,
"mean_token_accuracy": 0.8612963631749153,
"num_tokens": 1128787024.0,
"step": 1230
},
{
"epoch": 1.7549986670221274,
"grad_norm": 0.24377997978707636,
"learning_rate": 1.3560439132382218e-05,
"loss": 0.4249,
"mean_token_accuracy": 0.8616208277642727,
"num_tokens": 1133369468.0,
"step": 1235
},
{
"epoch": 1.7621078823424865,
"grad_norm": 0.24473326280197544,
"learning_rate": 1.336130647867015e-05,
"loss": 0.4233,
"mean_token_accuracy": 0.8611096739768982,
"num_tokens": 1137960753.0,
"step": 1240
},
{
"epoch": 1.7692170976628456,
"grad_norm": 0.2814923829698822,
"learning_rate": 1.3167687824846988e-05,
"loss": 0.4345,
"mean_token_accuracy": 0.8590093135833741,
"num_tokens": 1142557989.0,
"step": 1245
},
{
"epoch": 1.7763263129832043,
"grad_norm": 0.24671237642090413,
"learning_rate": 1.297960881084391e-05,
"loss": 0.4136,
"mean_token_accuracy": 0.8641826197504997,
"num_tokens": 1147139033.0,
"step": 1250
},
{
"epoch": 1.7834355283035634,
"grad_norm": 0.23802525665842986,
"learning_rate": 1.2797094343005807e-05,
"loss": 0.4212,
"mean_token_accuracy": 0.8627298250794411,
"num_tokens": 1151728912.0,
"step": 1255
},
{
"epoch": 1.7905447436239226,
"grad_norm": 0.24514167574215462,
"learning_rate": 1.2620168590793105e-05,
"loss": 0.4243,
"mean_token_accuracy": 0.8623115479946136,
"num_tokens": 1156315343.0,
"step": 1260
},
{
"epoch": 1.7976539589442815,
"grad_norm": 0.24177052216503225,
"learning_rate": 1.2448854983581134e-05,
"loss": 0.4205,
"mean_token_accuracy": 0.8636125177145004,
"num_tokens": 1160905222.0,
"step": 1265
},
{
"epoch": 1.8047631742646404,
"grad_norm": 0.25623340057701793,
"learning_rate": 1.2283176207557455e-05,
"loss": 0.4204,
"mean_token_accuracy": 0.863289151340723,
"num_tokens": 1165469584.0,
"step": 1270
},
{
"epoch": 1.8118723895849995,
"grad_norm": 0.2366529819101992,
"learning_rate": 1.2123154202717656e-05,
"loss": 0.4205,
"mean_token_accuracy": 0.8623673833906651,
"num_tokens": 1170087058.0,
"step": 1275
},
{
"epoch": 1.8189816049053587,
"grad_norm": 0.23815408906221286,
"learning_rate": 1.1968810159959982e-05,
"loss": 0.4167,
"mean_token_accuracy": 0.8636409521102906,
"num_tokens": 1174675450.0,
"step": 1280
},
{
"epoch": 1.8260908202257176,
"grad_norm": 0.25161717096488057,
"learning_rate": 1.1820164518279083e-05,
"loss": 0.4308,
"mean_token_accuracy": 0.8603747352957726,
"num_tokens": 1179252086.0,
"step": 1285
},
{
"epoch": 1.8332000355460765,
"grad_norm": 0.23828924023109987,
"learning_rate": 1.1677236962059421e-05,
"loss": 0.4161,
"mean_token_accuracy": 0.8636845953762531,
"num_tokens": 1183846581.0,
"step": 1290
},
{
"epoch": 1.8403092508664356,
"grad_norm": 0.2389439298878492,
"learning_rate": 1.1540046418468561e-05,
"loss": 0.4093,
"mean_token_accuracy": 0.8666847251355648,
"num_tokens": 1188439447.0,
"step": 1295
},
{
"epoch": 1.8474184661867947,
"grad_norm": 0.26036762406039,
"learning_rate": 1.1408611054950722e-05,
"loss": 0.4187,
"mean_token_accuracy": 0.8630855195224285,
"num_tokens": 1193031482.0,
"step": 1300
},
{
"epoch": 1.8474184661867947,
"eval_loss": 0.45738622546195984,
"eval_mean_token_accuracy": 0.847679163803134,
"eval_num_tokens": 1193031482.0,
"eval_runtime": 143.6355,
"eval_samples_per_second": 25.328,
"eval_steps_per_second": 0.794,
"step": 1300
},
{
"epoch": 1.8545276815071536,
"grad_norm": 0.2419491832206913,
"learning_rate": 1.1282948276820963e-05,
"loss": 0.4223,
"mean_token_accuracy": 0.8626484178006649,
"num_tokens": 1197621510.0,
"step": 1305
},
{
"epoch": 1.8616368968275125,
"grad_norm": 0.2366717377397619,
"learning_rate": 1.1163074724960326e-05,
"loss": 0.4202,
"mean_token_accuracy": 0.8629304811358451,
"num_tokens": 1202214988.0,
"step": 1310
},
{
"epoch": 1.8687461121478717,
"grad_norm": 0.24750576690261594,
"learning_rate": 1.10490062736121e-05,
"loss": 0.4159,
"mean_token_accuracy": 0.8640658937394619,
"num_tokens": 1206801749.0,
"step": 1315
},
{
"epoch": 1.8758553274682308,
"grad_norm": 0.2754980560042937,
"learning_rate": 1.094075802827971e-05,
"loss": 0.4224,
"mean_token_accuracy": 0.8619605071842671,
"num_tokens": 1211394066.0,
"step": 1320
},
{
"epoch": 1.8829645427885897,
"grad_norm": 0.2441756409539309,
"learning_rate": 1.0838344323726395e-05,
"loss": 0.4159,
"mean_token_accuracy": 0.8641899891197682,
"num_tokens": 1215982389.0,
"step": 1325
},
{
"epoch": 1.8900737581089486,
"grad_norm": 0.25017331261640485,
"learning_rate": 1.0741778722076896e-05,
"loss": 0.4141,
"mean_token_accuracy": 0.864534319192171,
"num_tokens": 1220561480.0,
"step": 1330
},
{
"epoch": 1.8971829734293078,
"grad_norm": 0.24928323459761015,
"learning_rate": 1.0651074011021495e-05,
"loss": 0.4148,
"mean_token_accuracy": 0.8647311642765999,
"num_tokens": 1225151015.0,
"step": 1335
},
{
"epoch": 1.9042921887496669,
"grad_norm": 0.26117744577378244,
"learning_rate": 1.056624220212263e-05,
"loss": 0.4227,
"mean_token_accuracy": 0.8627439729869366,
"num_tokens": 1229753553.0,
"step": 1340
},
{
"epoch": 1.9114014040700258,
"grad_norm": 0.250926981430339,
"learning_rate": 1.048729452922423e-05,
"loss": 0.4118,
"mean_token_accuracy": 0.8654024370014668,
"num_tokens": 1234324722.0,
"step": 1345
},
{
"epoch": 1.9185106193903847,
"grad_norm": 0.26445464932369295,
"learning_rate": 1.0414241446964102e-05,
"loss": 0.4176,
"mean_token_accuracy": 0.8638374984264374,
"num_tokens": 1238945254.0,
"step": 1350
},
{
"epoch": 1.9256198347107438,
"grad_norm": 0.24942959940503223,
"learning_rate": 1.0347092629389484e-05,
"loss": 0.4098,
"mean_token_accuracy": 0.8681537143886089,
"num_tokens": 1243530120.0,
"step": 1355
},
{
"epoch": 1.932729050031103,
"grad_norm": 0.25517475920539473,
"learning_rate": 1.0285856968675917e-05,
"loss": 0.4104,
"mean_token_accuracy": 0.8657238759100437,
"num_tokens": 1248126495.0,
"step": 1360
},
{
"epoch": 1.9398382653514619,
"grad_norm": 0.24624704699692396,
"learning_rate": 1.0230542573949747e-05,
"loss": 0.4053,
"mean_token_accuracy": 0.8677756235003471,
"num_tokens": 1252728208.0,
"step": 1365
},
{
"epoch": 1.9469474806718208,
"grad_norm": 0.24811417447193737,
"learning_rate": 1.0181156770214243e-05,
"loss": 0.4193,
"mean_token_accuracy": 0.8637429274618625,
"num_tokens": 1257314007.0,
"step": 1370
},
{
"epoch": 1.95405669599218,
"grad_norm": 0.2553291480205661,
"learning_rate": 1.013770609737961e-05,
"loss": 0.4153,
"mean_token_accuracy": 0.8649327427148819,
"num_tokens": 1261908378.0,
"step": 1375
},
{
"epoch": 1.961165911312539,
"grad_norm": 0.24846642652489853,
"learning_rate": 1.010019630939691e-05,
"loss": 0.4204,
"mean_token_accuracy": 0.8626691080629826,
"num_tokens": 1266492690.0,
"step": 1380
},
{
"epoch": 1.968275126632898,
"grad_norm": 0.24853442428779762,
"learning_rate": 1.0068632373496125e-05,
"loss": 0.4213,
"mean_token_accuracy": 0.862095658481121,
"num_tokens": 1271089050.0,
"step": 1385
},
{
"epoch": 1.9753843419532569,
"grad_norm": 0.25447008393745496,
"learning_rate": 1.0043018469528365e-05,
"loss": 0.4186,
"mean_token_accuracy": 0.8638553529977798,
"num_tokens": 1275693685.0,
"step": 1390
},
{
"epoch": 1.982493557273616,
"grad_norm": 0.25146974784680387,
"learning_rate": 1.0023357989412332e-05,
"loss": 0.4132,
"mean_token_accuracy": 0.8654829584062099,
"num_tokens": 1280282291.0,
"step": 1395
},
{
"epoch": 1.9896027725939749,
"grad_norm": 0.25186861166219776,
"learning_rate": 1.000965353668517e-05,
"loss": 0.4097,
"mean_token_accuracy": 0.8660168826580048,
"num_tokens": 1284878893.0,
"step": 1400
},
{
"epoch": 1.9896027725939749,
"eval_loss": 0.45450538396835327,
"eval_mean_token_accuracy": 0.8486974662856052,
"eval_num_tokens": 1284878893.0,
"eval_runtime": 143.4865,
"eval_samples_per_second": 25.354,
"eval_steps_per_second": 0.794,
"step": 1400
},
{
"epoch": 1.9967119879143338,
"grad_norm": 0.2548741967506241,
"learning_rate": 1.0001906926157681e-05,
"loss": 0.4088,
"mean_token_accuracy": 0.8670746453106404,
"num_tokens": 1289465244.0,
"step": 1405
},
{
"epoch": 2.0,
"mean_token_accuracy": 0.8681698522052249,
"num_tokens": 1291584473.0,
"step": 1408,
"total_flos": 9795365997903872.0,
"train_loss": 0.5166227378120477,
"train_runtime": 48333.5779,
"train_samples_per_second": 14.899,
"train_steps_per_second": 0.029
}
],
"logging_steps": 5,
"max_steps": 1408,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9795365997903872.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}