{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 325, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01556420233463035, "grad_norm": 32.460037697908916, "learning_rate": 1.25e-06, "loss": 2.0427, "step": 1 }, { "epoch": 0.0311284046692607, "grad_norm": 14.314919163607806, "learning_rate": 2.5e-06, "loss": 1.8704, "step": 2 }, { "epoch": 0.04669260700389105, "grad_norm": 27.137073787955334, "learning_rate": 3.7500000000000005e-06, "loss": 2.009, "step": 3 }, { "epoch": 0.0622568093385214, "grad_norm": 22.855686347208184, "learning_rate": 5e-06, "loss": 2.2908, "step": 4 }, { "epoch": 0.07782101167315175, "grad_norm": 17.021052812960836, "learning_rate": 6.25e-06, "loss": 1.6062, "step": 5 }, { "epoch": 0.0933852140077821, "grad_norm": 8.04115371265731, "learning_rate": 7.500000000000001e-06, "loss": 1.6328, "step": 6 }, { "epoch": 0.10894941634241245, "grad_norm": 8.684988862656194, "learning_rate": 8.750000000000001e-06, "loss": 1.5791, "step": 7 }, { "epoch": 0.1245136186770428, "grad_norm": 10.489704115967008, "learning_rate": 1e-05, "loss": 1.437, "step": 8 }, { "epoch": 0.14007782101167315, "grad_norm": 9.345852547867393, "learning_rate": 1.125e-05, "loss": 1.4409, "step": 9 }, { "epoch": 0.1556420233463035, "grad_norm": 12.566418385513145, "learning_rate": 1.25e-05, "loss": 1.9836, "step": 10 }, { "epoch": 0.17120622568093385, "grad_norm": 9.458631541156663, "learning_rate": 1.375e-05, "loss": 1.7625, "step": 11 }, { "epoch": 0.1867704280155642, "grad_norm": 4.775804904067164, "learning_rate": 1.5000000000000002e-05, "loss": 1.3358, "step": 12 }, { "epoch": 0.20233463035019456, "grad_norm": 7.063116954030999, "learning_rate": 1.6250000000000002e-05, "loss": 1.5127, "step": 13 }, { "epoch": 0.2178988326848249, "grad_norm": 20.081862895733657, "learning_rate": 1.7500000000000002e-05, "loss": 1.6345, "step": 14 }, { "epoch": 0.23346303501945526, "grad_norm": 7.4857791672008664, "learning_rate": 1.8750000000000002e-05, "loss": 1.6543, "step": 15 }, { "epoch": 0.2490272373540856, "grad_norm": 5.930935178930693, "learning_rate": 2e-05, "loss": 1.3915, "step": 16 }, { "epoch": 0.26459143968871596, "grad_norm": 4.891425231471704, "learning_rate": 1.9999819470801393e-05, "loss": 1.4524, "step": 17 }, { "epoch": 0.2801556420233463, "grad_norm": 8.526785651246044, "learning_rate": 1.999927789044796e-05, "loss": 1.3936, "step": 18 }, { "epoch": 0.29571984435797666, "grad_norm": 8.794855084727713, "learning_rate": 1.9998375280666606e-05, "loss": 1.4946, "step": 19 }, { "epoch": 0.311284046692607, "grad_norm": 5.5605631475331, "learning_rate": 1.9997111677667875e-05, "loss": 1.5168, "step": 20 }, { "epoch": 0.32684824902723736, "grad_norm": 6.109413863257978, "learning_rate": 1.999548713214448e-05, "loss": 1.3959, "step": 21 }, { "epoch": 0.3424124513618677, "grad_norm": 8.216498243755654, "learning_rate": 1.9993501709269297e-05, "loss": 1.5457, "step": 22 }, { "epoch": 0.35797665369649806, "grad_norm": 4.575471414036173, "learning_rate": 1.9991155488692714e-05, "loss": 1.3057, "step": 23 }, { "epoch": 0.3735408560311284, "grad_norm": 7.713781786096019, "learning_rate": 1.9988448564539475e-05, "loss": 1.52, "step": 24 }, { "epoch": 0.38910505836575876, "grad_norm": 9.333764622766337, "learning_rate": 1.998538104540488e-05, "loss": 1.2957, "step": 25 }, { "epoch": 0.4046692607003891, "grad_norm": 6.153987069403218, "learning_rate": 1.9981953054350436e-05, "loss": 1.2496, "step": 26 }, { "epoch": 0.42023346303501946, "grad_norm": 7.204269939964425, "learning_rate": 1.997816472889891e-05, "loss": 1.2556, "step": 27 }, { "epoch": 0.4357976653696498, "grad_norm": 9.356556657699878, "learning_rate": 1.9974016221028825e-05, "loss": 1.3693, "step": 28 }, { "epoch": 0.45136186770428016, "grad_norm": 18.674995074165327, "learning_rate": 1.9969507697168372e-05, "loss": 1.3901, "step": 29 }, { "epoch": 0.4669260700389105, "grad_norm": 8.743718086448856, "learning_rate": 1.996463933818869e-05, "loss": 1.4637, "step": 30 }, { "epoch": 0.48249027237354086, "grad_norm": 6.036142649788235, "learning_rate": 1.9959411339396667e-05, "loss": 1.446, "step": 31 }, { "epoch": 0.4980544747081712, "grad_norm": 11.118098752442311, "learning_rate": 1.9953823910527057e-05, "loss": 1.2325, "step": 32 }, { "epoch": 0.5136186770428015, "grad_norm": 8.532563003893063, "learning_rate": 1.9947877275734103e-05, "loss": 1.4182, "step": 33 }, { "epoch": 0.5291828793774319, "grad_norm": 8.44093940432976, "learning_rate": 1.9941571673582517e-05, "loss": 1.519, "step": 34 }, { "epoch": 0.5447470817120622, "grad_norm": 10.678108587266333, "learning_rate": 1.9934907357037913e-05, "loss": 1.4811, "step": 35 }, { "epoch": 0.5603112840466926, "grad_norm": 12.279223122144039, "learning_rate": 1.992788459345669e-05, "loss": 1.5383, "step": 36 }, { "epoch": 0.5758754863813229, "grad_norm": 9.6225514739576, "learning_rate": 1.9920503664575252e-05, "loss": 1.634, "step": 37 }, { "epoch": 0.5914396887159533, "grad_norm": 6.816063682489534, "learning_rate": 1.991276486649876e-05, "loss": 1.4922, "step": 38 }, { "epoch": 0.6070038910505836, "grad_norm": 8.462345745450083, "learning_rate": 1.990466850968921e-05, "loss": 1.4553, "step": 39 }, { "epoch": 0.622568093385214, "grad_norm": 7.511393821964174, "learning_rate": 1.9896214918953003e-05, "loss": 1.613, "step": 40 }, { "epoch": 0.6381322957198443, "grad_norm": 12.76100865490119, "learning_rate": 1.9887404433427917e-05, "loss": 1.3829, "step": 41 }, { "epoch": 0.6536964980544747, "grad_norm": 9.953708209183798, "learning_rate": 1.9878237406569476e-05, "loss": 1.2018, "step": 42 }, { "epoch": 0.669260700389105, "grad_norm": 17.646408078968005, "learning_rate": 1.9868714206136787e-05, "loss": 1.2234, "step": 43 }, { "epoch": 0.6848249027237354, "grad_norm": 6.687546724266615, "learning_rate": 1.985883521417781e-05, "loss": 1.3619, "step": 44 }, { "epoch": 0.7003891050583657, "grad_norm": 7.265965326318142, "learning_rate": 1.9848600827013976e-05, "loss": 1.3766, "step": 45 }, { "epoch": 0.7159533073929961, "grad_norm": 10.47550352191459, "learning_rate": 1.983801145522434e-05, "loss": 1.4978, "step": 46 }, { "epoch": 0.7315175097276264, "grad_norm": 7.719719591865551, "learning_rate": 1.9827067523629075e-05, "loss": 1.5026, "step": 47 }, { "epoch": 0.7470817120622568, "grad_norm": 6.653131519406193, "learning_rate": 1.981576947127245e-05, "loss": 1.3853, "step": 48 }, { "epoch": 0.7626459143968871, "grad_norm": 8.90485792562694, "learning_rate": 1.9804117751405213e-05, "loss": 1.4688, "step": 49 }, { "epoch": 0.7782101167315175, "grad_norm": 7.409558877721469, "learning_rate": 1.9792112831466385e-05, "loss": 1.2318, "step": 50 }, { "epoch": 0.7937743190661478, "grad_norm": 10.548282670671158, "learning_rate": 1.9779755193064545e-05, "loss": 1.1853, "step": 51 }, { "epoch": 0.8093385214007782, "grad_norm": 8.195819265190133, "learning_rate": 1.9767045331958486e-05, "loss": 1.3867, "step": 52 }, { "epoch": 0.8249027237354085, "grad_norm": 7.195118313650143, "learning_rate": 1.9753983758037324e-05, "loss": 1.3728, "step": 53 }, { "epoch": 0.8404669260700389, "grad_norm": 6.865553942117972, "learning_rate": 1.9740570995300054e-05, "loss": 1.4971, "step": 54 }, { "epoch": 0.8560311284046692, "grad_norm": 13.711210401882385, "learning_rate": 1.9726807581834522e-05, "loss": 1.5261, "step": 55 }, { "epoch": 0.8715953307392996, "grad_norm": 5.9709078674205776, "learning_rate": 1.971269406979584e-05, "loss": 1.1547, "step": 56 }, { "epoch": 0.8871595330739299, "grad_norm": 7.305358527853627, "learning_rate": 1.9698231025384234e-05, "loss": 1.2522, "step": 57 }, { "epoch": 0.9027237354085603, "grad_norm": 7.144056189221613, "learning_rate": 1.9683419028822333e-05, "loss": 1.2921, "step": 58 }, { "epoch": 0.9182879377431906, "grad_norm": 6.9606269877032005, "learning_rate": 1.9668258674331882e-05, "loss": 1.2708, "step": 59 }, { "epoch": 0.933852140077821, "grad_norm": 9.722878510396836, "learning_rate": 1.9652750570109914e-05, "loss": 1.4427, "step": 60 }, { "epoch": 0.9494163424124513, "grad_norm": 9.963909818863343, "learning_rate": 1.9636895338304347e-05, "loss": 1.3381, "step": 61 }, { "epoch": 0.9649805447470817, "grad_norm": 6.446954560778193, "learning_rate": 1.9620693614989024e-05, "loss": 1.4269, "step": 62 }, { "epoch": 0.980544747081712, "grad_norm": 7.603247343231994, "learning_rate": 1.9604146050138194e-05, "loss": 1.4298, "step": 63 }, { "epoch": 0.9961089494163424, "grad_norm": 6.349264800380979, "learning_rate": 1.958725330760044e-05, "loss": 1.0356, "step": 64 }, { "epoch": 1.0, "grad_norm": 6.349264800380979, "learning_rate": 1.9570016065072047e-05, "loss": 0.3835, "step": 65 }, { "epoch": 1.0155642023346303, "grad_norm": 8.467306321422342, "learning_rate": 1.9552435014069805e-05, "loss": 1.329, "step": 66 }, { "epoch": 1.0311284046692606, "grad_norm": 8.408957527442329, "learning_rate": 1.953451085990329e-05, "loss": 1.2351, "step": 67 }, { "epoch": 1.046692607003891, "grad_norm": 13.406884070709882, "learning_rate": 1.9516244321646533e-05, "loss": 1.6201, "step": 68 }, { "epoch": 1.0622568093385214, "grad_norm": 6.37594555257866, "learning_rate": 1.9497636132109208e-05, "loss": 1.3477, "step": 69 }, { "epoch": 1.0778210116731517, "grad_norm": 6.084035911845763, "learning_rate": 1.9478687037807215e-05, "loss": 1.1285, "step": 70 }, { "epoch": 1.0933852140077822, "grad_norm": 5.938233548564638, "learning_rate": 1.9459397798932732e-05, "loss": 1.2728, "step": 71 }, { "epoch": 1.1089494163424125, "grad_norm": 4.999727686110884, "learning_rate": 1.9439769189323727e-05, "loss": 1.2983, "step": 72 }, { "epoch": 1.1245136186770428, "grad_norm": 7.546246025502454, "learning_rate": 1.9419801996432896e-05, "loss": 1.2559, "step": 73 }, { "epoch": 1.140077821011673, "grad_norm": 5.50857701371075, "learning_rate": 1.9399497021296094e-05, "loss": 1.2612, "step": 74 }, { "epoch": 1.1556420233463034, "grad_norm": 19.755888394433814, "learning_rate": 1.937885507850018e-05, "loss": 1.3324, "step": 75 }, { "epoch": 1.171206225680934, "grad_norm": 7.035191415935576, "learning_rate": 1.935787699615036e-05, "loss": 1.3674, "step": 76 }, { "epoch": 1.1867704280155642, "grad_norm": 6.756030563749288, "learning_rate": 1.933656361583694e-05, "loss": 1.183, "step": 77 }, { "epoch": 1.2023346303501945, "grad_norm": 8.841867408302369, "learning_rate": 1.931491579260158e-05, "loss": 1.2485, "step": 78 }, { "epoch": 1.217898832684825, "grad_norm": 5.387317154730147, "learning_rate": 1.9292934394902992e-05, "loss": 1.5021, "step": 79 }, { "epoch": 1.2334630350194553, "grad_norm": 14.871501309985415, "learning_rate": 1.9270620304582077e-05, "loss": 1.5676, "step": 80 }, { "epoch": 1.2490272373540856, "grad_norm": 10.997862717881436, "learning_rate": 1.9247974416826585e-05, "loss": 1.3707, "step": 81 }, { "epoch": 1.264591439688716, "grad_norm": 7.2507989168336735, "learning_rate": 1.922499764013518e-05, "loss": 1.5842, "step": 82 }, { "epoch": 1.2801556420233462, "grad_norm": 9.537768676771838, "learning_rate": 1.920169089628099e-05, "loss": 1.4067, "step": 83 }, { "epoch": 1.2957198443579767, "grad_norm": 8.163551805381035, "learning_rate": 1.9178055120274625e-05, "loss": 1.2465, "step": 84 }, { "epoch": 1.311284046692607, "grad_norm": 6.640431551160505, "learning_rate": 1.9154091260326698e-05, "loss": 1.3956, "step": 85 }, { "epoch": 1.3268482490272373, "grad_norm": 26.20900323475498, "learning_rate": 1.9129800277809742e-05, "loss": 1.2936, "step": 86 }, { "epoch": 1.3424124513618678, "grad_norm": 9.219414651764641, "learning_rate": 1.910518314721967e-05, "loss": 1.5675, "step": 87 }, { "epoch": 1.3579766536964981, "grad_norm": 6.169958007250397, "learning_rate": 1.9080240856136675e-05, "loss": 1.51, "step": 88 }, { "epoch": 1.3735408560311284, "grad_norm": 7.775254406585112, "learning_rate": 1.9054974405185605e-05, "loss": 1.5837, "step": 89 }, { "epoch": 1.3891050583657587, "grad_norm": 6.000524431156958, "learning_rate": 1.902938480799583e-05, "loss": 1.239, "step": 90 }, { "epoch": 1.404669260700389, "grad_norm": 6.445928086057122, "learning_rate": 1.9003473091160557e-05, "loss": 1.2915, "step": 91 }, { "epoch": 1.4202334630350195, "grad_norm": 8.175071297255242, "learning_rate": 1.8977240294195676e-05, "loss": 1.5306, "step": 92 }, { "epoch": 1.4357976653696498, "grad_norm": 5.720210778577455, "learning_rate": 1.895068746949803e-05, "loss": 1.3159, "step": 93 }, { "epoch": 1.45136186770428, "grad_norm": 9.833046906184748, "learning_rate": 1.8923815682303214e-05, "loss": 1.553, "step": 94 }, { "epoch": 1.4669260700389106, "grad_norm": 6.851775208018755, "learning_rate": 1.8896626010642833e-05, "loss": 1.5885, "step": 95 }, { "epoch": 1.482490272373541, "grad_norm": 7.1890477833480295, "learning_rate": 1.886911954530124e-05, "loss": 1.5322, "step": 96 }, { "epoch": 1.4980544747081712, "grad_norm": 8.094895085879692, "learning_rate": 1.884129738977181e-05, "loss": 1.326, "step": 97 }, { "epoch": 1.5136186770428015, "grad_norm": 9.604855227288974, "learning_rate": 1.8813160660212636e-05, "loss": 1.4432, "step": 98 }, { "epoch": 1.5291828793774318, "grad_norm": 10.150820230299194, "learning_rate": 1.8784710485401775e-05, "loss": 1.2174, "step": 99 }, { "epoch": 1.544747081712062, "grad_norm": 9.238169258331935, "learning_rate": 1.875594800669195e-05, "loss": 1.2617, "step": 100 }, { "epoch": 1.5603112840466926, "grad_norm": 9.441159749907698, "learning_rate": 1.8726874377964764e-05, "loss": 1.234, "step": 101 }, { "epoch": 1.575875486381323, "grad_norm": 7.2474587450825, "learning_rate": 1.869749076558442e-05, "loss": 1.1766, "step": 102 }, { "epoch": 1.5914396887159534, "grad_norm": 6.23841795541216, "learning_rate": 1.8667798348350918e-05, "loss": 1.2045, "step": 103 }, { "epoch": 1.6070038910505837, "grad_norm": 6.554762465264489, "learning_rate": 1.863779831745276e-05, "loss": 1.1382, "step": 104 }, { "epoch": 1.622568093385214, "grad_norm": 5.997892924979543, "learning_rate": 1.8607491876419183e-05, "loss": 1.4172, "step": 105 }, { "epoch": 1.6381322957198443, "grad_norm": 6.371740787277364, "learning_rate": 1.8576880241071852e-05, "loss": 1.3726, "step": 106 }, { "epoch": 1.6536964980544746, "grad_norm": 7.86316264854559, "learning_rate": 1.8545964639476105e-05, "loss": 1.3789, "step": 107 }, { "epoch": 1.669260700389105, "grad_norm": 7.417730516932254, "learning_rate": 1.851474631189167e-05, "loss": 1.2634, "step": 108 }, { "epoch": 1.6848249027237354, "grad_norm": 8.222005228354707, "learning_rate": 1.848322651072291e-05, "loss": 1.4934, "step": 109 }, { "epoch": 1.7003891050583657, "grad_norm": 6.747214231413209, "learning_rate": 1.8451406500468598e-05, "loss": 1.2275, "step": 110 }, { "epoch": 1.7159533073929962, "grad_norm": 7.522875546614491, "learning_rate": 1.841928755767116e-05, "loss": 1.4232, "step": 111 }, { "epoch": 1.7315175097276265, "grad_norm": 8.395027504519643, "learning_rate": 1.8386870970865488e-05, "loss": 1.4617, "step": 112 }, { "epoch": 1.7470817120622568, "grad_norm": 5.682565677158726, "learning_rate": 1.835415804052724e-05, "loss": 1.2782, "step": 113 }, { "epoch": 1.7626459143968871, "grad_norm": 17.319126618522, "learning_rate": 1.8321150079020656e-05, "loss": 1.2252, "step": 114 }, { "epoch": 1.7782101167315174, "grad_norm": 6.698269746955856, "learning_rate": 1.8287848410545922e-05, "loss": 1.2638, "step": 115 }, { "epoch": 1.7937743190661477, "grad_norm": 5.389193784149641, "learning_rate": 1.825425437108605e-05, "loss": 1.2101, "step": 116 }, { "epoch": 1.8093385214007782, "grad_norm": 7.556872728377763, "learning_rate": 1.8220369308353255e-05, "loss": 1.4187, "step": 117 }, { "epoch": 1.8249027237354085, "grad_norm": 11.500517221563017, "learning_rate": 1.8186194581734922e-05, "loss": 1.2614, "step": 118 }, { "epoch": 1.840466926070039, "grad_norm": 6.973436659657028, "learning_rate": 1.815173156223906e-05, "loss": 1.6287, "step": 119 }, { "epoch": 1.8560311284046693, "grad_norm": 8.82430378736939, "learning_rate": 1.811698163243929e-05, "loss": 1.7683, "step": 120 }, { "epoch": 1.8715953307392996, "grad_norm": 7.44523008567814, "learning_rate": 1.8081946186419375e-05, "loss": 1.3718, "step": 121 }, { "epoch": 1.88715953307393, "grad_norm": 11.092035922405335, "learning_rate": 1.804662662971732e-05, "loss": 1.2244, "step": 122 }, { "epoch": 1.9027237354085602, "grad_norm": 8.907024665470365, "learning_rate": 1.801102437926896e-05, "loss": 1.1516, "step": 123 }, { "epoch": 1.9182879377431905, "grad_norm": 7.330064038520311, "learning_rate": 1.797514086335113e-05, "loss": 1.288, "step": 124 }, { "epoch": 1.933852140077821, "grad_norm": 10.980569189715007, "learning_rate": 1.7938977521524355e-05, "loss": 1.3815, "step": 125 }, { "epoch": 1.9494163424124513, "grad_norm": 8.792218570481833, "learning_rate": 1.79025358045751e-05, "loss": 1.5337, "step": 126 }, { "epoch": 1.9649805447470818, "grad_norm": 8.759608210372287, "learning_rate": 1.786581717445759e-05, "loss": 1.4971, "step": 127 }, { "epoch": 1.9805447470817121, "grad_norm": 6.719480961643205, "learning_rate": 1.782882310423512e-05, "loss": 1.3845, "step": 128 }, { "epoch": 1.9961089494163424, "grad_norm": 11.98512915914497, "learning_rate": 1.7791555078020992e-05, "loss": 1.8083, "step": 129 }, { "epoch": 2.0, "grad_norm": 11.98512915914497, "learning_rate": 1.7754014590918964e-05, "loss": 0.3816, "step": 130 }, { "epoch": 2.0155642023346303, "grad_norm": 6.076810203247505, "learning_rate": 1.771620314896327e-05, "loss": 1.1992, "step": 131 }, { "epoch": 2.0311284046692606, "grad_norm": 5.799331945733901, "learning_rate": 1.76781222690582e-05, "loss": 0.9603, "step": 132 }, { "epoch": 2.046692607003891, "grad_norm": 7.639810257162938, "learning_rate": 1.763977347891725e-05, "loss": 1.1609, "step": 133 }, { "epoch": 2.062256809338521, "grad_norm": 4.641223961792758, "learning_rate": 1.7601158317001835e-05, "loss": 1.4175, "step": 134 }, { "epoch": 2.077821011673152, "grad_norm": 6.543906827631292, "learning_rate": 1.756227833245956e-05, "loss": 1.1293, "step": 135 }, { "epoch": 2.093385214007782, "grad_norm": 5.807214604460152, "learning_rate": 1.752313508506208e-05, "loss": 1.2959, "step": 136 }, { "epoch": 2.1089494163424125, "grad_norm": 5.553563692538825, "learning_rate": 1.748373014514253e-05, "loss": 1.5164, "step": 137 }, { "epoch": 2.124513618677043, "grad_norm": 5.570141549794821, "learning_rate": 1.7444065093532507e-05, "loss": 1.223, "step": 138 }, { "epoch": 2.140077821011673, "grad_norm": 10.218543215792467, "learning_rate": 1.740414152149868e-05, "loss": 1.2693, "step": 139 }, { "epoch": 2.1556420233463034, "grad_norm": 6.654829769858401, "learning_rate": 1.736396103067893e-05, "loss": 1.2822, "step": 140 }, { "epoch": 2.1712062256809337, "grad_norm": 7.063879942686053, "learning_rate": 1.73235252330181e-05, "loss": 1.2313, "step": 141 }, { "epoch": 2.1867704280155644, "grad_norm": 8.26459321928057, "learning_rate": 1.728283575070333e-05, "loss": 1.3606, "step": 142 }, { "epoch": 2.2023346303501947, "grad_norm": 8.158351915542648, "learning_rate": 1.7241894216098995e-05, "loss": 1.1179, "step": 143 }, { "epoch": 2.217898832684825, "grad_norm": 6.025748582779371, "learning_rate": 1.720070227168118e-05, "loss": 1.2041, "step": 144 }, { "epoch": 2.2334630350194553, "grad_norm": 7.30387722610904, "learning_rate": 1.7159261569971828e-05, "loss": 1.2661, "step": 145 }, { "epoch": 2.2490272373540856, "grad_norm": 6.274862129099997, "learning_rate": 1.7117573773472418e-05, "loss": 1.3209, "step": 146 }, { "epoch": 2.264591439688716, "grad_norm": 8.052280930110639, "learning_rate": 1.7075640554597278e-05, "loss": 1.1796, "step": 147 }, { "epoch": 2.280155642023346, "grad_norm": 10.893409682259657, "learning_rate": 1.703346359560651e-05, "loss": 1.4143, "step": 148 }, { "epoch": 2.2957198443579765, "grad_norm": 9.090803934496224, "learning_rate": 1.6991044588538455e-05, "loss": 1.1129, "step": 149 }, { "epoch": 2.311284046692607, "grad_norm": 9.407206024176414, "learning_rate": 1.694838523514187e-05, "loss": 1.1586, "step": 150 }, { "epoch": 2.3268482490272375, "grad_norm": 8.517506234140628, "learning_rate": 1.690548724680761e-05, "loss": 1.4231, "step": 151 }, { "epoch": 2.342412451361868, "grad_norm": 6.1621127583650255, "learning_rate": 1.6862352344500004e-05, "loss": 1.3577, "step": 152 }, { "epoch": 2.357976653696498, "grad_norm": 37.87140785888283, "learning_rate": 1.681898225868779e-05, "loss": 1.3384, "step": 153 }, { "epoch": 2.3735408560311284, "grad_norm": 8.232248393098434, "learning_rate": 1.677537872927471e-05, "loss": 1.3798, "step": 154 }, { "epoch": 2.3891050583657587, "grad_norm": 8.407799337670626, "learning_rate": 1.673154350552971e-05, "loss": 1.3535, "step": 155 }, { "epoch": 2.404669260700389, "grad_norm": 11.052092401395148, "learning_rate": 1.6687478346016736e-05, "loss": 1.3003, "step": 156 }, { "epoch": 2.4202334630350193, "grad_norm": 11.914207501283393, "learning_rate": 1.6643185018524227e-05, "loss": 1.2803, "step": 157 }, { "epoch": 2.43579766536965, "grad_norm": 9.369362265914104, "learning_rate": 1.6598665299994162e-05, "loss": 1.3889, "step": 158 }, { "epoch": 2.4513618677042803, "grad_norm": 6.155749688526481, "learning_rate": 1.655392097645079e-05, "loss": 1.3065, "step": 159 }, { "epoch": 2.4669260700389106, "grad_norm": 7.610662694751711, "learning_rate": 1.6508953842928966e-05, "loss": 1.3677, "step": 160 }, { "epoch": 2.482490272373541, "grad_norm": 8.874434851480999, "learning_rate": 1.6463765703402154e-05, "loss": 1.1467, "step": 161 }, { "epoch": 2.498054474708171, "grad_norm": 6.048721329156399, "learning_rate": 1.6418358370710048e-05, "loss": 1.2878, "step": 162 }, { "epoch": 2.5136186770428015, "grad_norm": 8.012765512165412, "learning_rate": 1.6372733666485842e-05, "loss": 1.2369, "step": 163 }, { "epoch": 2.529182879377432, "grad_norm": 9.98813005959193, "learning_rate": 1.6326893421083157e-05, "loss": 1.3142, "step": 164 }, { "epoch": 2.544747081712062, "grad_norm": 6.989195393880204, "learning_rate": 1.6280839473502607e-05, "loss": 1.2651, "step": 165 }, { "epoch": 2.5603112840466924, "grad_norm": 13.328346097167426, "learning_rate": 1.6234573671318027e-05, "loss": 1.5734, "step": 166 }, { "epoch": 2.5758754863813227, "grad_norm": 7.47939944848892, "learning_rate": 1.6188097870602344e-05, "loss": 1.1876, "step": 167 }, { "epoch": 2.5914396887159534, "grad_norm": 9.813447558487908, "learning_rate": 1.614141393585313e-05, "loss": 1.2428, "step": 168 }, { "epoch": 2.6070038910505837, "grad_norm": 8.170087254779242, "learning_rate": 1.6094523739917797e-05, "loss": 1.2607, "step": 169 }, { "epoch": 2.622568093385214, "grad_norm": 6.143715398790098, "learning_rate": 1.6047429163918444e-05, "loss": 1.3353, "step": 170 }, { "epoch": 2.6381322957198443, "grad_norm": 17.125656609647287, "learning_rate": 1.600013209717642e-05, "loss": 1.4865, "step": 171 }, { "epoch": 2.6536964980544746, "grad_norm": 10.776858476646886, "learning_rate": 1.5952634437136523e-05, "loss": 1.5398, "step": 172 }, { "epoch": 2.669260700389105, "grad_norm": 7.0818183172701605, "learning_rate": 1.5904938089290864e-05, "loss": 1.3059, "step": 173 }, { "epoch": 2.6848249027237356, "grad_norm": 20.107104125600667, "learning_rate": 1.5857044967102423e-05, "loss": 1.325, "step": 174 }, { "epoch": 2.700389105058366, "grad_norm": 9.685536590494742, "learning_rate": 1.580895699192831e-05, "loss": 1.2209, "step": 175 }, { "epoch": 2.7159533073929962, "grad_norm": 14.272302800628733, "learning_rate": 1.5760676092942663e-05, "loss": 1.1283, "step": 176 }, { "epoch": 2.7315175097276265, "grad_norm": 8.529426981120846, "learning_rate": 1.571220420705926e-05, "loss": 1.1967, "step": 177 }, { "epoch": 2.747081712062257, "grad_norm": 9.25820017440293, "learning_rate": 1.5663543278853818e-05, "loss": 1.2561, "step": 178 }, { "epoch": 2.762645914396887, "grad_norm": 6.7665962336727725, "learning_rate": 1.5614695260485973e-05, "loss": 1.1381, "step": 179 }, { "epoch": 2.7782101167315174, "grad_norm": 11.677157670781277, "learning_rate": 1.5565662111620967e-05, "loss": 1.123, "step": 180 }, { "epoch": 2.7937743190661477, "grad_norm": 10.612616450023706, "learning_rate": 1.5516445799351046e-05, "loss": 1.4241, "step": 181 }, { "epoch": 2.809338521400778, "grad_norm": 5.785097669619218, "learning_rate": 1.5467048298116516e-05, "loss": 1.1191, "step": 182 }, { "epoch": 2.8249027237354083, "grad_norm": 19.456658401658856, "learning_rate": 1.5417471589626563e-05, "loss": 1.0679, "step": 183 }, { "epoch": 2.840466926070039, "grad_norm": 8.020315674017025, "learning_rate": 1.5367717662779732e-05, "loss": 1.2059, "step": 184 }, { "epoch": 2.8560311284046693, "grad_norm": 9.637600750103296, "learning_rate": 1.531778851358414e-05, "loss": 1.1613, "step": 185 }, { "epoch": 2.8715953307392996, "grad_norm": 6.693770146874348, "learning_rate": 1.5267686145077406e-05, "loss": 1.127, "step": 186 }, { "epoch": 2.88715953307393, "grad_norm": 18.137993374614158, "learning_rate": 1.5217412567246298e-05, "loss": 1.5315, "step": 187 }, { "epoch": 2.90272373540856, "grad_norm": 9.571406690209106, "learning_rate": 1.5166969796946087e-05, "loss": 1.1774, "step": 188 }, { "epoch": 2.9182879377431905, "grad_norm": 19.959548104529457, "learning_rate": 1.5116359857819635e-05, "loss": 1.3707, "step": 189 }, { "epoch": 2.9338521400778212, "grad_norm": 16.169561781876904, "learning_rate": 1.5065584780216225e-05, "loss": 1.3618, "step": 190 }, { "epoch": 2.9494163424124515, "grad_norm": 7.308704556972184, "learning_rate": 1.501464660111009e-05, "loss": 1.5718, "step": 191 }, { "epoch": 2.964980544747082, "grad_norm": 11.175947473356464, "learning_rate": 1.4963547364018711e-05, "loss": 1.1821, "step": 192 }, { "epoch": 2.980544747081712, "grad_norm": 6.880501518603281, "learning_rate": 1.4912289118920821e-05, "loss": 1.2976, "step": 193 }, { "epoch": 2.9961089494163424, "grad_norm": 16.45488939160082, "learning_rate": 1.4860873922174188e-05, "loss": 1.5251, "step": 194 }, { "epoch": 3.0, "grad_norm": 16.45488939160082, "learning_rate": 1.4809303836433086e-05, "loss": 0.3486, "step": 195 }, { "epoch": 3.0155642023346303, "grad_norm": 17.26667705122932, "learning_rate": 1.4757580930565569e-05, "loss": 1.2262, "step": 196 }, { "epoch": 3.0311284046692606, "grad_norm": 7.079930630802389, "learning_rate": 1.4705707279570476e-05, "loss": 1.0964, "step": 197 }, { "epoch": 3.046692607003891, "grad_norm": 5.9707164634211045, "learning_rate": 1.4653684964494163e-05, "loss": 1.1285, "step": 198 }, { "epoch": 3.062256809338521, "grad_norm": 7.374420248201497, "learning_rate": 1.460151607234705e-05, "loss": 1.1528, "step": 199 }, { "epoch": 3.077821011673152, "grad_norm": 7.823768857549868, "learning_rate": 1.4549202696019868e-05, "loss": 1.6089, "step": 200 }, { "epoch": 3.093385214007782, "grad_norm": 13.095423850316486, "learning_rate": 1.44967469341997e-05, "loss": 1.4053, "step": 201 }, { "epoch": 3.1089494163424125, "grad_norm": 8.57838469081796, "learning_rate": 1.4444150891285809e-05, "loss": 1.3911, "step": 202 }, { "epoch": 3.124513618677043, "grad_norm": 10.492752547850309, "learning_rate": 1.4391416677305183e-05, "loss": 1.7466, "step": 203 }, { "epoch": 3.140077821011673, "grad_norm": 12.04907785845439, "learning_rate": 1.4338546407827912e-05, "loss": 1.3069, "step": 204 }, { "epoch": 3.1556420233463034, "grad_norm": 9.082131491175343, "learning_rate": 1.4285542203882301e-05, "loss": 1.3909, "step": 205 }, { "epoch": 3.1712062256809337, "grad_norm": 7.8265612070424035, "learning_rate": 1.4232406191869786e-05, "loss": 1.1016, "step": 206 }, { "epoch": 3.1867704280155644, "grad_norm": 7.017081240597243, "learning_rate": 1.4179140503479622e-05, "loss": 1.3007, "step": 207 }, { "epoch": 3.2023346303501947, "grad_norm": 11.222861362951033, "learning_rate": 1.4125747275603384e-05, "loss": 1.4584, "step": 208 }, { "epoch": 3.217898832684825, "grad_norm": 8.228254518059948, "learning_rate": 1.4072228650249205e-05, "loss": 1.1437, "step": 209 }, { "epoch": 3.2334630350194553, "grad_norm": 5.620105281993818, "learning_rate": 1.4018586774455876e-05, "loss": 1.0801, "step": 210 }, { "epoch": 3.2490272373540856, "grad_norm": 8.270265614809485, "learning_rate": 1.3964823800206698e-05, "loss": 1.4172, "step": 211 }, { "epoch": 3.264591439688716, "grad_norm": 10.759581180311525, "learning_rate": 1.3910941884343144e-05, "loss": 1.3431, "step": 212 }, { "epoch": 3.280155642023346, "grad_norm": 9.256121877658135, "learning_rate": 1.3856943188478353e-05, "loss": 1.1614, "step": 213 }, { "epoch": 3.2957198443579765, "grad_norm": 8.278774263244427, "learning_rate": 1.3802829878910387e-05, "loss": 1.5056, "step": 214 }, { "epoch": 3.311284046692607, "grad_norm": 7.988122466692388, "learning_rate": 1.3748604126535335e-05, "loss": 1.1658, "step": 215 }, { "epoch": 3.3268482490272375, "grad_norm": 25.69960791189383, "learning_rate": 1.3694268106760225e-05, "loss": 1.5151, "step": 216 }, { "epoch": 3.342412451361868, "grad_norm": 6.716460926595037, "learning_rate": 1.3639823999415744e-05, "loss": 1.2914, "step": 217 }, { "epoch": 3.357976653696498, "grad_norm": 16.08954441850179, "learning_rate": 1.3585273988668804e-05, "loss": 1.2714, "step": 218 }, { "epoch": 3.3735408560311284, "grad_norm": 7.277025513470879, "learning_rate": 1.3530620262934892e-05, "loss": 1.3116, "step": 219 }, { "epoch": 3.3891050583657587, "grad_norm": 10.39394733050087, "learning_rate": 1.3475865014790303e-05, "loss": 1.3044, "step": 220 }, { "epoch": 3.404669260700389, "grad_norm": 6.793016895617423, "learning_rate": 1.342101044088416e-05, "loss": 1.6471, "step": 221 }, { "epoch": 3.4202334630350193, "grad_norm": 8.307309122049785, "learning_rate": 1.3366058741850302e-05, "loss": 1.1521, "step": 222 }, { "epoch": 3.43579766536965, "grad_norm": 11.961031173730973, "learning_rate": 1.3311012122218995e-05, "loss": 1.4236, "step": 223 }, { "epoch": 3.4513618677042803, "grad_norm": 12.085462142899722, "learning_rate": 1.3255872790328485e-05, "loss": 1.4304, "step": 224 }, { "epoch": 3.4669260700389106, "grad_norm": 12.043766601436674, "learning_rate": 1.320064295823642e-05, "loss": 1.105, "step": 225 }, { "epoch": 3.482490272373541, "grad_norm": 6.830210730411899, "learning_rate": 1.3145324841631093e-05, "loss": 1.0992, "step": 226 }, { "epoch": 3.498054474708171, "grad_norm": 5.183807189599878, "learning_rate": 1.3089920659742561e-05, "loss": 1.1104, "step": 227 }, { "epoch": 3.5136186770428015, "grad_norm": 8.173248197932283, "learning_rate": 1.3034432635253615e-05, "loss": 1.3416, "step": 228 }, { "epoch": 3.529182879377432, "grad_norm": 6.840510399122009, "learning_rate": 1.2978862994210609e-05, "loss": 1.1301, "step": 229 }, { "epoch": 3.544747081712062, "grad_norm": 6.795335624835114, "learning_rate": 1.2923213965934158e-05, "loss": 1.1824, "step": 230 }, { "epoch": 3.5603112840466924, "grad_norm": 12.487628895610484, "learning_rate": 1.2867487782929702e-05, "loss": 1.4, "step": 231 }, { "epoch": 3.5758754863813227, "grad_norm": 7.498953182696051, "learning_rate": 1.2811686680797942e-05, "loss": 1.0117, "step": 232 }, { "epoch": 3.5914396887159534, "grad_norm": 11.599970960534753, "learning_rate": 1.2755812898145157e-05, "loss": 1.457, "step": 233 }, { "epoch": 3.6070038910505837, "grad_norm": 10.728915546115905, "learning_rate": 1.269986867649339e-05, "loss": 1.0275, "step": 234 }, { "epoch": 3.622568093385214, "grad_norm": 7.212471068564826, "learning_rate": 1.2643856260190533e-05, "loss": 1.2426, "step": 235 }, { "epoch": 3.6381322957198443, "grad_norm": 7.257310852456952, "learning_rate": 1.2587777896320279e-05, "loss": 1.1293, "step": 236 }, { "epoch": 3.6536964980544746, "grad_norm": 5.41922031825349, "learning_rate": 1.2531635834611981e-05, "loss": 1.2113, "step": 237 }, { "epoch": 3.669260700389105, "grad_norm": 6.480868678608291, "learning_rate": 1.2475432327350396e-05, "loss": 1.2706, "step": 238 }, { "epoch": 3.6848249027237356, "grad_norm": 6.80783495792981, "learning_rate": 1.2419169629285335e-05, "loss": 1.0698, "step": 239 }, { "epoch": 3.700389105058366, "grad_norm": 7.234690686703731, "learning_rate": 1.236284999754119e-05, "loss": 1.4999, "step": 240 }, { "epoch": 3.7159533073929962, "grad_norm": 9.331788321366725, "learning_rate": 1.2306475691526407e-05, "loss": 1.0748, "step": 241 }, { "epoch": 3.7315175097276265, "grad_norm": 7.024835954207243, "learning_rate": 1.2250048972842823e-05, "loss": 1.241, "step": 242 }, { "epoch": 3.747081712062257, "grad_norm": 8.381718396842675, "learning_rate": 1.2193572105194953e-05, "loss": 1.4368, "step": 243 }, { "epoch": 3.762645914396887, "grad_norm": 8.467657085467351, "learning_rate": 1.2137047354299165e-05, "loss": 1.1316, "step": 244 }, { "epoch": 3.7782101167315174, "grad_norm": 6.56413902354136, "learning_rate": 1.2080476987792786e-05, "loss": 1.2924, "step": 245 }, { "epoch": 3.7937743190661477, "grad_norm": 11.457150627414464, "learning_rate": 1.2023863275143138e-05, "loss": 1.1807, "step": 246 }, { "epoch": 3.809338521400778, "grad_norm": 11.784032118298212, "learning_rate": 1.1967208487556477e-05, "loss": 1.3601, "step": 247 }, { "epoch": 3.8249027237354083, "grad_norm": 7.618595171387596, "learning_rate": 1.1910514897886892e-05, "loss": 1.0757, "step": 248 }, { "epoch": 3.840466926070039, "grad_norm": 15.913563034938784, "learning_rate": 1.1853784780545123e-05, "loss": 1.381, "step": 249 }, { "epoch": 3.8560311284046693, "grad_norm": 9.448026664890813, "learning_rate": 1.1797020411407303e-05, "loss": 1.1996, "step": 250 }, { "epoch": 3.8715953307392996, "grad_norm": 9.36740858593225, "learning_rate": 1.1740224067723676e-05, "loss": 1.3333, "step": 251 }, { "epoch": 3.88715953307393, "grad_norm": 6.202904532995713, "learning_rate": 1.1683398028027218e-05, "loss": 1.0989, "step": 252 }, { "epoch": 3.90272373540856, "grad_norm": 6.807434814034836, "learning_rate": 1.162654457204224e-05, "loss": 1.0997, "step": 253 }, { "epoch": 3.9182879377431905, "grad_norm": 7.474757873337292, "learning_rate": 1.1569665980592934e-05, "loss": 1.3777, "step": 254 }, { "epoch": 3.9338521400778212, "grad_norm": 10.348844472235802, "learning_rate": 1.1512764535511862e-05, "loss": 1.4729, "step": 255 }, { "epoch": 3.9494163424124515, "grad_norm": 9.595077808457091, "learning_rate": 1.1455842519548417e-05, "loss": 1.1649, "step": 256 }, { "epoch": 3.964980544747082, "grad_norm": 12.298828537750817, "learning_rate": 1.139890221627725e-05, "loss": 1.1849, "step": 257 }, { "epoch": 3.980544747081712, "grad_norm": 9.042727283207734, "learning_rate": 1.1341945910006656e-05, "loss": 1.3065, "step": 258 }, { "epoch": 3.9961089494163424, "grad_norm": 7.530830029119465, "learning_rate": 1.1284975885686926e-05, "loss": 1.2184, "step": 259 }, { "epoch": 4.0, "grad_norm": 7.889222687866491, "learning_rate": 1.1227994428818692e-05, "loss": 0.4148, "step": 260 }, { "epoch": 4.01556420233463, "grad_norm": 7.442531502305794, "learning_rate": 1.1171003825361233e-05, "loss": 1.2908, "step": 261 }, { "epoch": 4.031128404669261, "grad_norm": 5.5861970223083945, "learning_rate": 1.1114006361640763e-05, "loss": 1.1309, "step": 262 }, { "epoch": 4.046692607003891, "grad_norm": 7.19625969261882, "learning_rate": 1.105700432425871e-05, "loss": 1.149, "step": 263 }, { "epoch": 4.062256809338521, "grad_norm": 7.319827903412528, "learning_rate": 1.1000000000000001e-05, "loss": 1.2781, "step": 264 }, { "epoch": 4.0778210116731515, "grad_norm": 8.579128780187071, "learning_rate": 1.094299567574129e-05, "loss": 1.3237, "step": 265 }, { "epoch": 4.093385214007782, "grad_norm": 6.407050357928513, "learning_rate": 1.0885993638359242e-05, "loss": 1.4427, "step": 266 }, { "epoch": 4.108949416342412, "grad_norm": 8.132256321978414, "learning_rate": 1.0828996174638768e-05, "loss": 1.394, "step": 267 }, { "epoch": 4.124513618677042, "grad_norm": 6.3812196369738645, "learning_rate": 1.0772005571181313e-05, "loss": 1.1583, "step": 268 }, { "epoch": 4.1400778210116735, "grad_norm": 13.85707637253458, "learning_rate": 1.0715024114313077e-05, "loss": 1.4429, "step": 269 }, { "epoch": 4.155642023346304, "grad_norm": 21.528973223922698, "learning_rate": 1.0658054089993349e-05, "loss": 1.4796, "step": 270 }, { "epoch": 4.171206225680934, "grad_norm": 6.331202799716651, "learning_rate": 1.0601097783722751e-05, "loss": 1.2535, "step": 271 }, { "epoch": 4.186770428015564, "grad_norm": 6.192886059438914, "learning_rate": 1.0544157480451586e-05, "loss": 1.365, "step": 272 }, { "epoch": 4.202334630350195, "grad_norm": 9.092962942972557, "learning_rate": 1.048723546448814e-05, "loss": 1.2418, "step": 273 }, { "epoch": 4.217898832684825, "grad_norm": 9.088713897338243, "learning_rate": 1.0430334019407069e-05, "loss": 1.5284, "step": 274 }, { "epoch": 4.233463035019455, "grad_norm": 7.935649176333802, "learning_rate": 1.0373455427957762e-05, "loss": 1.1285, "step": 275 }, { "epoch": 4.249027237354086, "grad_norm": 6.747077137812801, "learning_rate": 1.0316601971972787e-05, "loss": 1.1133, "step": 276 }, { "epoch": 4.264591439688716, "grad_norm": 6.190409945190735, "learning_rate": 1.0259775932276325e-05, "loss": 1.309, "step": 277 }, { "epoch": 4.280155642023346, "grad_norm": 7.659504834977579, "learning_rate": 1.0202979588592702e-05, "loss": 1.134, "step": 278 }, { "epoch": 4.2957198443579765, "grad_norm": 6.035623878393655, "learning_rate": 1.0146215219454882e-05, "loss": 1.1742, "step": 279 }, { "epoch": 4.311284046692607, "grad_norm": 14.743602784497947, "learning_rate": 1.0089485102113113e-05, "loss": 1.2349, "step": 280 }, { "epoch": 4.326848249027237, "grad_norm": 34.32059780452938, "learning_rate": 1.0032791512443527e-05, "loss": 1.1312, "step": 281 }, { "epoch": 4.342412451361867, "grad_norm": 6.537432858619071, "learning_rate": 9.976136724856869e-06, "loss": 1.2261, "step": 282 }, { "epoch": 4.357976653696498, "grad_norm": 6.805760395487357, "learning_rate": 9.919523012207217e-06, "loss": 1.2109, "step": 283 }, { "epoch": 4.373540856031129, "grad_norm": 8.577713820915738, "learning_rate": 9.862952645700841e-06, "loss": 1.4719, "step": 284 }, { "epoch": 4.389105058365759, "grad_norm": 5.490284251492559, "learning_rate": 9.806427894805048e-06, "loss": 1.312, "step": 285 }, { "epoch": 4.404669260700389, "grad_norm": 8.327345481649647, "learning_rate": 9.74995102715718e-06, "loss": 1.2461, "step": 286 }, { "epoch": 4.42023346303502, "grad_norm": 6.111747370932479, "learning_rate": 9.693524308473596e-06, "loss": 1.3926, "step": 287 }, { "epoch": 4.43579766536965, "grad_norm": 8.141202180806642, "learning_rate": 9.637150002458813e-06, "loss": 1.2008, "step": 288 }, { "epoch": 4.45136186770428, "grad_norm": 9.12454930317004, "learning_rate": 9.58083037071467e-06, "loss": 1.095, "step": 289 }, { "epoch": 4.466926070038911, "grad_norm": 8.109583837297146, "learning_rate": 9.524567672649606e-06, "loss": 1.1697, "step": 290 }, { "epoch": 4.482490272373541, "grad_norm": 6.940093378941827, "learning_rate": 9.468364165388022e-06, "loss": 1.5673, "step": 291 }, { "epoch": 4.498054474708171, "grad_norm": 8.029108149503637, "learning_rate": 9.412222103679728e-06, "loss": 1.4509, "step": 292 }, { "epoch": 4.5136186770428015, "grad_norm": 7.3797751032553345, "learning_rate": 9.356143739809472e-06, "loss": 1.1467, "step": 293 }, { "epoch": 4.529182879377432, "grad_norm": 11.435244944138505, "learning_rate": 9.300131323506617e-06, "loss": 1.1765, "step": 294 }, { "epoch": 4.544747081712062, "grad_norm": 9.332357406857541, "learning_rate": 9.244187101854846e-06, "loss": 1.2847, "step": 295 }, { "epoch": 4.560311284046692, "grad_norm": 7.009210489706155, "learning_rate": 9.188313319202057e-06, "loss": 1.3632, "step": 296 }, { "epoch": 4.575875486381323, "grad_norm": 8.623706053802827, "learning_rate": 9.132512217070301e-06, "loss": 1.3002, "step": 297 }, { "epoch": 4.591439688715953, "grad_norm": 7.10477639818194, "learning_rate": 9.076786034065843e-06, "loss": 1.2559, "step": 298 }, { "epoch": 4.607003891050583, "grad_norm": 16.24344498357353, "learning_rate": 9.021137005789393e-06, "loss": 1.3538, "step": 299 }, { "epoch": 4.622568093385214, "grad_norm": 5.84525864201727, "learning_rate": 8.965567364746388e-06, "loss": 1.2, "step": 300 }, { "epoch": 4.638132295719844, "grad_norm": 14.06325953577117, "learning_rate": 8.910079340257442e-06, "loss": 1.1461, "step": 301 }, { "epoch": 4.653696498054475, "grad_norm": 7.743168891211479, "learning_rate": 8.854675158368908e-06, "loss": 1.3149, "step": 302 }, { "epoch": 4.669260700389105, "grad_norm": 5.646503833678955, "learning_rate": 8.799357041763583e-06, "loss": 1.1332, "step": 303 }, { "epoch": 4.684824902723736, "grad_norm": 7.846526869237445, "learning_rate": 8.744127209671516e-06, "loss": 1.2928, "step": 304 }, { "epoch": 4.700389105058366, "grad_norm": 24.312874392544217, "learning_rate": 8.688987877781008e-06, "loss": 1.4022, "step": 305 }, { "epoch": 4.715953307392996, "grad_norm": 11.006982390019688, "learning_rate": 8.633941258149699e-06, "loss": 1.6169, "step": 306 }, { "epoch": 4.7315175097276265, "grad_norm": 8.161038346958115, "learning_rate": 8.578989559115842e-06, "loss": 1.1205, "step": 307 }, { "epoch": 4.747081712062257, "grad_norm": 10.360221596629799, "learning_rate": 8.524134985209698e-06, "loss": 1.1426, "step": 308 }, { "epoch": 4.762645914396887, "grad_norm": 8.086983191969743, "learning_rate": 8.46937973706511e-06, "loss": 1.1742, "step": 309 }, { "epoch": 4.778210116731517, "grad_norm": 7.93821224397039, "learning_rate": 8.414726011331197e-06, "loss": 1.3003, "step": 310 }, { "epoch": 4.793774319066148, "grad_norm": 6.8964113141193195, "learning_rate": 8.360176000584257e-06, "loss": 1.3921, "step": 311 }, { "epoch": 4.809338521400778, "grad_norm": 9.541918636824354, "learning_rate": 8.30573189323978e-06, "loss": 1.4406, "step": 312 }, { "epoch": 4.824902723735408, "grad_norm": 9.88288044965242, "learning_rate": 8.251395873464671e-06, "loss": 1.3104, "step": 313 }, { "epoch": 4.840466926070039, "grad_norm": 7.034997485501755, "learning_rate": 8.197170121089617e-06, "loss": 1.4781, "step": 314 }, { "epoch": 4.856031128404669, "grad_norm": 7.024182095026262, "learning_rate": 8.143056811521653e-06, "loss": 1.0645, "step": 315 }, { "epoch": 4.8715953307393, "grad_norm": 6.763562775528538, "learning_rate": 8.089058115656859e-06, "loss": 1.1622, "step": 316 }, { "epoch": 4.88715953307393, "grad_norm": 22.915236408148893, "learning_rate": 8.035176199793309e-06, "loss": 1.2201, "step": 317 }, { "epoch": 4.902723735408561, "grad_norm": 22.9486570021447, "learning_rate": 7.981413225544128e-06, "loss": 1.3326, "step": 318 }, { "epoch": 4.918287937743191, "grad_norm": 7.766492426098674, "learning_rate": 7.9277713497508e-06, "loss": 1.3444, "step": 319 }, { "epoch": 4.933852140077821, "grad_norm": 8.922409990504782, "learning_rate": 7.87425272439662e-06, "loss": 1.2533, "step": 320 }, { "epoch": 4.9494163424124515, "grad_norm": 10.120435136565488, "learning_rate": 7.82085949652038e-06, "loss": 1.4161, "step": 321 }, { "epoch": 4.964980544747082, "grad_norm": 11.585619162092177, "learning_rate": 7.767593808130216e-06, "loss": 1.408, "step": 322 }, { "epoch": 4.980544747081712, "grad_norm": 8.068469524272025, "learning_rate": 7.714457796117705e-06, "loss": 1.175, "step": 323 }, { "epoch": 4.996108949416342, "grad_norm": 6.472014606610803, "learning_rate": 7.661453592172093e-06, "loss": 1.356, "step": 324 }, { "epoch": 5.0, "grad_norm": 6.472014606610803, "learning_rate": 7.60858332269482e-06, "loss": 0.4053, "step": 325 } ], "logging_steps": 1.0, "max_steps": 512, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 176455044300800.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }