| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.0, | |
| "eval_steps": 100, | |
| "global_step": 1408, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.007109215320359016, | |
| "grad_norm": 22.657577985866105, | |
| "learning_rate": 9.302325581395349e-06, | |
| "loss": 2.574, | |
| "mean_token_accuracy": 0.5464246176183224, | |
| "num_tokens": 4589382.0, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.014218430640718031, | |
| "grad_norm": 2.3543289370923013, | |
| "learning_rate": 2.0930232558139536e-05, | |
| "loss": 1.4882, | |
| "mean_token_accuracy": 0.6589333653450012, | |
| "num_tokens": 9171524.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.021327645961077047, | |
| "grad_norm": 0.8063547574982903, | |
| "learning_rate": 3.2558139534883724e-05, | |
| "loss": 1.0174, | |
| "mean_token_accuracy": 0.7330243036150932, | |
| "num_tokens": 13765157.0, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.028436861281436063, | |
| "grad_norm": 0.572573905518242, | |
| "learning_rate": 4.418604651162791e-05, | |
| "loss": 0.8773, | |
| "mean_token_accuracy": 0.7569610200822353, | |
| "num_tokens": 18369874.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.035546076601795075, | |
| "grad_norm": 0.5738482260117446, | |
| "learning_rate": 5.5813953488372095e-05, | |
| "loss": 0.7975, | |
| "mean_token_accuracy": 0.7729738861322403, | |
| "num_tokens": 22960290.0, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.042655291922154094, | |
| "grad_norm": 0.5016568944917689, | |
| "learning_rate": 6.744186046511628e-05, | |
| "loss": 0.7632, | |
| "mean_token_accuracy": 0.778630904853344, | |
| "num_tokens": 27556623.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.049764507242513106, | |
| "grad_norm": 0.4845613474361907, | |
| "learning_rate": 7.906976744186047e-05, | |
| "loss": 0.7326, | |
| "mean_token_accuracy": 0.7872321248054505, | |
| "num_tokens": 32158408.0, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.056873722562872125, | |
| "grad_norm": 0.4270154516127363, | |
| "learning_rate": 9.069767441860465e-05, | |
| "loss": 0.7095, | |
| "mean_token_accuracy": 0.7919960044324398, | |
| "num_tokens": 36742233.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.06398293788323114, | |
| "grad_norm": 0.499498695141066, | |
| "learning_rate": 9.9999880816326e-05, | |
| "loss": 0.6973, | |
| "mean_token_accuracy": 0.7952379912137986, | |
| "num_tokens": 41335670.0, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.07109215320359015, | |
| "grad_norm": 0.4645180201543763, | |
| "learning_rate": 9.999570945402425e-05, | |
| "loss": 0.6853, | |
| "mean_token_accuracy": 0.7981184311211109, | |
| "num_tokens": 45940079.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.07820136852394917, | |
| "grad_norm": 0.434255531179794, | |
| "learning_rate": 9.998557953932929e-05, | |
| "loss": 0.6688, | |
| "mean_token_accuracy": 0.8012012615799904, | |
| "num_tokens": 50533771.0, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.08531058384430819, | |
| "grad_norm": 0.393754634337621, | |
| "learning_rate": 9.99694924136941e-05, | |
| "loss": 0.6725, | |
| "mean_token_accuracy": 0.800255061686039, | |
| "num_tokens": 55133444.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.0924197991646672, | |
| "grad_norm": 0.49718727212066355, | |
| "learning_rate": 9.99474502074547e-05, | |
| "loss": 0.6664, | |
| "mean_token_accuracy": 0.801218880712986, | |
| "num_tokens": 59726447.0, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.09952901448502621, | |
| "grad_norm": 0.4005142024066312, | |
| "learning_rate": 9.991945583954808e-05, | |
| "loss": 0.6549, | |
| "mean_token_accuracy": 0.8056452445685863, | |
| "num_tokens": 64319917.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.10663822980538523, | |
| "grad_norm": 0.3774090383980249, | |
| "learning_rate": 9.988551301712567e-05, | |
| "loss": 0.6454, | |
| "mean_token_accuracy": 0.806719920784235, | |
| "num_tokens": 68898868.0, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.11374744512574425, | |
| "grad_norm": 0.3995895890256704, | |
| "learning_rate": 9.984562623506235e-05, | |
| "loss": 0.6464, | |
| "mean_token_accuracy": 0.8064703330397606, | |
| "num_tokens": 73481972.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.12085666044610326, | |
| "grad_norm": 0.3801619159341505, | |
| "learning_rate": 9.979980077536136e-05, | |
| "loss": 0.6462, | |
| "mean_token_accuracy": 0.8080633491277694, | |
| "num_tokens": 78079419.0, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.1279658757664623, | |
| "grad_norm": 0.37074794226689833, | |
| "learning_rate": 9.974804270645462e-05, | |
| "loss": 0.6362, | |
| "mean_token_accuracy": 0.8091117829084397, | |
| "num_tokens": 82670195.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.13507509108682128, | |
| "grad_norm": 0.37193721608812236, | |
| "learning_rate": 9.969035888239937e-05, | |
| "loss": 0.635, | |
| "mean_token_accuracy": 0.8079991653561592, | |
| "num_tokens": 87257953.0, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.1421843064071803, | |
| "grad_norm": 0.36251703620037773, | |
| "learning_rate": 9.96267569419703e-05, | |
| "loss": 0.6315, | |
| "mean_token_accuracy": 0.8096475720405578, | |
| "num_tokens": 91838382.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.1421843064071803, | |
| "eval_loss": 0.5971412062644958, | |
| "eval_mean_token_accuracy": 0.8093206621052926, | |
| "eval_num_tokens": 91838382.0, | |
| "eval_runtime": 141.8153, | |
| "eval_samples_per_second": 25.653, | |
| "eval_steps_per_second": 0.804, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.14929352172753932, | |
| "grad_norm": 0.41583625971563776, | |
| "learning_rate": 9.955724530764809e-05, | |
| "loss": 0.6381, | |
| "mean_token_accuracy": 0.8077230393886566, | |
| "num_tokens": 96431755.0, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.15640273704789834, | |
| "grad_norm": 0.3705693803444073, | |
| "learning_rate": 9.948183318450413e-05, | |
| "loss": 0.6197, | |
| "mean_token_accuracy": 0.8116156131029129, | |
| "num_tokens": 101027690.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.16351195236825736, | |
| "grad_norm": 0.3214510651452395, | |
| "learning_rate": 9.940053055898133e-05, | |
| "loss": 0.6313, | |
| "mean_token_accuracy": 0.8089181430637836, | |
| "num_tokens": 105628547.0, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.17062116768861638, | |
| "grad_norm": 0.34220731720085373, | |
| "learning_rate": 9.93133481975719e-05, | |
| "loss": 0.6077, | |
| "mean_token_accuracy": 0.814984206855297, | |
| "num_tokens": 110243592.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.1777303830089754, | |
| "grad_norm": 0.35675802487560043, | |
| "learning_rate": 9.922029764539148e-05, | |
| "loss": 0.6263, | |
| "mean_token_accuracy": 0.8096928559243679, | |
| "num_tokens": 114832845.0, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.1848395983293344, | |
| "grad_norm": 0.3422296936678833, | |
| "learning_rate": 9.912139122465027e-05, | |
| "loss": 0.6116, | |
| "mean_token_accuracy": 0.8140982151031494, | |
| "num_tokens": 119435421.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.1919488136496934, | |
| "grad_norm": 0.3599918244922273, | |
| "learning_rate": 9.901664203302126e-05, | |
| "loss": 0.6052, | |
| "mean_token_accuracy": 0.8154805108904839, | |
| "num_tokens": 124028647.0, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.19905802897005243, | |
| "grad_norm": 0.3595154303423279, | |
| "learning_rate": 9.890606394190588e-05, | |
| "loss": 0.6126, | |
| "mean_token_accuracy": 0.8132404424250126, | |
| "num_tokens": 128628413.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.20616724429041144, | |
| "grad_norm": 0.3711012466200944, | |
| "learning_rate": 9.878967159459693e-05, | |
| "loss": 0.6068, | |
| "mean_token_accuracy": 0.8164977565407753, | |
| "num_tokens": 133219422.0, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.21327645961077046, | |
| "grad_norm": 0.35910926284617867, | |
| "learning_rate": 9.866748040433956e-05, | |
| "loss": 0.6099, | |
| "mean_token_accuracy": 0.8152773261070252, | |
| "num_tokens": 137825952.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.22038567493112948, | |
| "grad_norm": 0.4205439208166243, | |
| "learning_rate": 9.853950655229009e-05, | |
| "loss": 0.6064, | |
| "mean_token_accuracy": 0.815191026777029, | |
| "num_tokens": 142422368.0, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.2274948902514885, | |
| "grad_norm": 0.32091150374802263, | |
| "learning_rate": 9.840576698537329e-05, | |
| "loss": 0.6093, | |
| "mean_token_accuracy": 0.8135301224887371, | |
| "num_tokens": 147015990.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.23460410557184752, | |
| "grad_norm": 0.32627028158119226, | |
| "learning_rate": 9.826627941403811e-05, | |
| "loss": 0.5969, | |
| "mean_token_accuracy": 0.8182829037308693, | |
| "num_tokens": 151627096.0, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.2417133208922065, | |
| "grad_norm": 0.32405674248273, | |
| "learning_rate": 9.812106230991248e-05, | |
| "loss": 0.6068, | |
| "mean_token_accuracy": 0.8159149341285229, | |
| "num_tokens": 156218968.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.24882253621256553, | |
| "grad_norm": 0.3206982540127891, | |
| "learning_rate": 9.79701349033571e-05, | |
| "loss": 0.6039, | |
| "mean_token_accuracy": 0.8161494679749012, | |
| "num_tokens": 160797401.0, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.2559317515329246, | |
| "grad_norm": 0.3360732448004463, | |
| "learning_rate": 9.78135171809189e-05, | |
| "loss": 0.6068, | |
| "mean_token_accuracy": 0.8159954428672791, | |
| "num_tokens": 165402684.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.26304096685328354, | |
| "grad_norm": 0.33789233259366286, | |
| "learning_rate": 9.76512298826844e-05, | |
| "loss": 0.6026, | |
| "mean_token_accuracy": 0.8167447924613953, | |
| "num_tokens": 169997282.0, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.27015018217364256, | |
| "grad_norm": 0.3089560668153988, | |
| "learning_rate": 9.748329449953302e-05, | |
| "loss": 0.5904, | |
| "mean_token_accuracy": 0.8193566597998142, | |
| "num_tokens": 174589836.0, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.2772593974940016, | |
| "grad_norm": 0.32060053414915524, | |
| "learning_rate": 9.73097332702914e-05, | |
| "loss": 0.6044, | |
| "mean_token_accuracy": 0.8175870932638645, | |
| "num_tokens": 179181747.0, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.2843686128143606, | |
| "grad_norm": 0.32004664048912745, | |
| "learning_rate": 9.713056917878818e-05, | |
| "loss": 0.5888, | |
| "mean_token_accuracy": 0.8192018747329712, | |
| "num_tokens": 183760367.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.2843686128143606, | |
| "eval_loss": 0.5599971413612366, | |
| "eval_mean_token_accuracy": 0.8188303473748659, | |
| "eval_num_tokens": 183760367.0, | |
| "eval_runtime": 145.8536, | |
| "eval_samples_per_second": 24.943, | |
| "eval_steps_per_second": 0.782, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.2914778281347196, | |
| "grad_norm": 0.3094551492752116, | |
| "learning_rate": 9.694582595081057e-05, | |
| "loss": 0.5872, | |
| "mean_token_accuracy": 0.819921114295721, | |
| "num_tokens": 188360903.0, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.29858704345507864, | |
| "grad_norm": 0.36254147904822126, | |
| "learning_rate": 9.67555280509623e-05, | |
| "loss": 0.5942, | |
| "mean_token_accuracy": 0.817745155096054, | |
| "num_tokens": 192932381.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.30569625877543766, | |
| "grad_norm": 0.3377909564779145, | |
| "learning_rate": 9.655970067942405e-05, | |
| "loss": 0.5994, | |
| "mean_token_accuracy": 0.8163805276155471, | |
| "num_tokens": 197505985.0, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.3128054740957967, | |
| "grad_norm": 0.30751780494672465, | |
| "learning_rate": 9.63583697686162e-05, | |
| "loss": 0.5902, | |
| "mean_token_accuracy": 0.8196643941104412, | |
| "num_tokens": 202105424.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.3199146894161557, | |
| "grad_norm": 0.34345028355301316, | |
| "learning_rate": 9.615156197976477e-05, | |
| "loss": 0.582, | |
| "mean_token_accuracy": 0.8217154465615749, | |
| "num_tokens": 206686951.0, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.3270239047365147, | |
| "grad_norm": 0.3216135018716631, | |
| "learning_rate": 9.593930469937087e-05, | |
| "loss": 0.5708, | |
| "mean_token_accuracy": 0.8250658005475998, | |
| "num_tokens": 211278788.0, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.33413312005687373, | |
| "grad_norm": 0.32564659909940696, | |
| "learning_rate": 9.572162603558393e-05, | |
| "loss": 0.5928, | |
| "mean_token_accuracy": 0.819525595754385, | |
| "num_tokens": 215877205.0, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.34124233537723275, | |
| "grad_norm": 0.4839583335140069, | |
| "learning_rate": 9.549855481447954e-05, | |
| "loss": 0.5882, | |
| "mean_token_accuracy": 0.8204580388963223, | |
| "num_tokens": 220486454.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.34835155069759177, | |
| "grad_norm": 0.3268671171699921, | |
| "learning_rate": 9.527012057624224e-05, | |
| "loss": 0.5836, | |
| "mean_token_accuracy": 0.8208626843988895, | |
| "num_tokens": 225080225.0, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.3554607660179508, | |
| "grad_norm": 0.3244498327733708, | |
| "learning_rate": 9.50363535712535e-05, | |
| "loss": 0.586, | |
| "mean_token_accuracy": 0.8207595020532608, | |
| "num_tokens": 229657012.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.3625699813383098, | |
| "grad_norm": 0.29889265357291406, | |
| "learning_rate": 9.479728475608593e-05, | |
| "loss": 0.5919, | |
| "mean_token_accuracy": 0.8190862230956555, | |
| "num_tokens": 234248976.0, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.3696791966586688, | |
| "grad_norm": 0.34636883393423384, | |
| "learning_rate": 9.455294578940384e-05, | |
| "loss": 0.5765, | |
| "mean_token_accuracy": 0.8226364821195602, | |
| "num_tokens": 238829734.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.3767884119790278, | |
| "grad_norm": 0.3092592234408446, | |
| "learning_rate": 9.430336902777083e-05, | |
| "loss": 0.576, | |
| "mean_token_accuracy": 0.821333235502243, | |
| "num_tokens": 243418989.0, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.3838976272993868, | |
| "grad_norm": 0.30454136223380207, | |
| "learning_rate": 9.404858752136499e-05, | |
| "loss": 0.5771, | |
| "mean_token_accuracy": 0.8237294301390647, | |
| "num_tokens": 248015701.0, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.39100684261974583, | |
| "grad_norm": 0.30289215095264577, | |
| "learning_rate": 9.378863500960222e-05, | |
| "loss": 0.5709, | |
| "mean_token_accuracy": 0.8236084163188935, | |
| "num_tokens": 252613191.0, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.39811605794010485, | |
| "grad_norm": 0.3010273864601919, | |
| "learning_rate": 9.352354591666827e-05, | |
| "loss": 0.5861, | |
| "mean_token_accuracy": 0.820894256979227, | |
| "num_tokens": 257210808.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.40522527326046387, | |
| "grad_norm": 0.30175911100812025, | |
| "learning_rate": 9.325335534696017e-05, | |
| "loss": 0.5753, | |
| "mean_token_accuracy": 0.8225005254149437, | |
| "num_tokens": 261790131.0, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.4123344885808229, | |
| "grad_norm": 0.28871941798325856, | |
| "learning_rate": 9.29780990804375e-05, | |
| "loss": 0.5799, | |
| "mean_token_accuracy": 0.821347926557064, | |
| "num_tokens": 266377324.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.4194437039011819, | |
| "grad_norm": 0.28095014086273895, | |
| "learning_rate": 9.269781356788424e-05, | |
| "loss": 0.581, | |
| "mean_token_accuracy": 0.8209108576178551, | |
| "num_tokens": 270967910.0, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.4265529192215409, | |
| "grad_norm": 0.2893211807696515, | |
| "learning_rate": 9.241253592608183e-05, | |
| "loss": 0.5755, | |
| "mean_token_accuracy": 0.8242007777094841, | |
| "num_tokens": 275570273.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.4265529192215409, | |
| "eval_loss": 0.5416839122772217, | |
| "eval_mean_token_accuracy": 0.8231211885025627, | |
| "eval_num_tokens": 275570273.0, | |
| "eval_runtime": 145.5254, | |
| "eval_samples_per_second": 24.999, | |
| "eval_steps_per_second": 0.783, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.43366213454189995, | |
| "grad_norm": 0.30733885282429685, | |
| "learning_rate": 9.212230393289385e-05, | |
| "loss": 0.5781, | |
| "mean_token_accuracy": 0.8230207331478596, | |
| "num_tokens": 280172533.0, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.44077134986225897, | |
| "grad_norm": 0.2682470819307261, | |
| "learning_rate": 9.182715602226341e-05, | |
| "loss": 0.5625, | |
| "mean_token_accuracy": 0.8270745746791363, | |
| "num_tokens": 284763929.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.447880565182618, | |
| "grad_norm": 0.2962012849994535, | |
| "learning_rate": 9.152713127912355e-05, | |
| "loss": 0.5848, | |
| "mean_token_accuracy": 0.8201167277991772, | |
| "num_tokens": 289376903.0, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.454989780502977, | |
| "grad_norm": 0.28564514411407316, | |
| "learning_rate": 9.12222694342213e-05, | |
| "loss": 0.5732, | |
| "mean_token_accuracy": 0.8246621482074261, | |
| "num_tokens": 293966796.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.462098995823336, | |
| "grad_norm": 0.30020425973519915, | |
| "learning_rate": 9.091261085885646e-05, | |
| "loss": 0.5606, | |
| "mean_token_accuracy": 0.826822079718113, | |
| "num_tokens": 298540346.0, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.46920821114369504, | |
| "grad_norm": 0.2887047887642146, | |
| "learning_rate": 9.059819655953536e-05, | |
| "loss": 0.5738, | |
| "mean_token_accuracy": 0.823461939394474, | |
| "num_tokens": 303112604.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.476317426464054, | |
| "grad_norm": 0.3180269352697689, | |
| "learning_rate": 9.027906817254063e-05, | |
| "loss": 0.5654, | |
| "mean_token_accuracy": 0.8256018176674843, | |
| "num_tokens": 307694241.0, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.483426641784413, | |
| "grad_norm": 0.29567931374872014, | |
| "learning_rate": 8.995526795841753e-05, | |
| "loss": 0.558, | |
| "mean_token_accuracy": 0.8256605207920075, | |
| "num_tokens": 312289299.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.49053585710477204, | |
| "grad_norm": 0.3336504103662035, | |
| "learning_rate": 8.962683879637747e-05, | |
| "loss": 0.5617, | |
| "mean_token_accuracy": 0.8257805988192558, | |
| "num_tokens": 316884766.0, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.49764507242513106, | |
| "grad_norm": 0.3705167375534613, | |
| "learning_rate": 8.929382417861991e-05, | |
| "loss": 0.561, | |
| "mean_token_accuracy": 0.8267210200428963, | |
| "num_tokens": 321461198.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.5047542877454901, | |
| "grad_norm": 0.2946584460529412, | |
| "learning_rate": 8.895626820457283e-05, | |
| "loss": 0.557, | |
| "mean_token_accuracy": 0.828194110840559, | |
| "num_tokens": 326064722.0, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.5118635030658492, | |
| "grad_norm": 0.31227448766803945, | |
| "learning_rate": 8.861421557505282e-05, | |
| "loss": 0.5522, | |
| "mean_token_accuracy": 0.8295037761330605, | |
| "num_tokens": 330652094.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.5189727183862082, | |
| "grad_norm": 1.0759474066945163, | |
| "learning_rate": 8.826771158634567e-05, | |
| "loss": 0.5629, | |
| "mean_token_accuracy": 0.8260238766670227, | |
| "num_tokens": 335255835.0, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.5260819337065671, | |
| "grad_norm": 0.2758992633553522, | |
| "learning_rate": 8.791680212420797e-05, | |
| "loss": 0.5502, | |
| "mean_token_accuracy": 0.828965923935175, | |
| "num_tokens": 339843476.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.5331911490269261, | |
| "grad_norm": 0.29696149610793166, | |
| "learning_rate": 8.756153365779066e-05, | |
| "loss": 0.5542, | |
| "mean_token_accuracy": 0.8278730027377605, | |
| "num_tokens": 344420533.0, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.5403003643472851, | |
| "grad_norm": 0.284706804181623, | |
| "learning_rate": 8.720195323348545e-05, | |
| "loss": 0.559, | |
| "mean_token_accuracy": 0.8278782211244107, | |
| "num_tokens": 349010370.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.5474095796676441, | |
| "grad_norm": 0.3046957362601185, | |
| "learning_rate": 8.68381084686946e-05, | |
| "loss": 0.5576, | |
| "mean_token_accuracy": 0.8258513130247593, | |
| "num_tokens": 353598451.0, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.5545187949880032, | |
| "grad_norm": 0.3134773718519533, | |
| "learning_rate": 8.647004754552526e-05, | |
| "loss": 0.5612, | |
| "mean_token_accuracy": 0.8255665130913258, | |
| "num_tokens": 358195615.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.5616280103083622, | |
| "grad_norm": 0.33349640254961, | |
| "learning_rate": 8.609781920440891e-05, | |
| "loss": 0.552, | |
| "mean_token_accuracy": 0.8278413727879524, | |
| "num_tokens": 362764034.0, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.5687372256287212, | |
| "grad_norm": 0.32034152048464726, | |
| "learning_rate": 8.5721472737647e-05, | |
| "loss": 0.5534, | |
| "mean_token_accuracy": 0.8273369200527668, | |
| "num_tokens": 367350265.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.5687372256287212, | |
| "eval_loss": 0.5274047255516052, | |
| "eval_mean_token_accuracy": 0.8264030280866121, | |
| "eval_num_tokens": 367350265.0, | |
| "eval_runtime": 146.0134, | |
| "eval_samples_per_second": 24.916, | |
| "eval_steps_per_second": 0.781, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.5758464409490802, | |
| "grad_norm": 0.29085093151843905, | |
| "learning_rate": 8.534105798288331e-05, | |
| "loss": 0.5506, | |
| "mean_token_accuracy": 0.830031219124794, | |
| "num_tokens": 371939618.0, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.5829556562694392, | |
| "grad_norm": 0.27710417408529203, | |
| "learning_rate": 8.49566253165043e-05, | |
| "loss": 0.5439, | |
| "mean_token_accuracy": 0.8304261237382888, | |
| "num_tokens": 376519800.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.5900648715897983, | |
| "grad_norm": 0.2611394917691902, | |
| "learning_rate": 8.456822564696789e-05, | |
| "loss": 0.5409, | |
| "mean_token_accuracy": 0.832954341173172, | |
| "num_tokens": 381102299.0, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.5971740869101573, | |
| "grad_norm": 0.42771473321829473, | |
| "learning_rate": 8.417591040806213e-05, | |
| "loss": 0.5504, | |
| "mean_token_accuracy": 0.8300940133631229, | |
| "num_tokens": 385700779.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.6042833022305163, | |
| "grad_norm": 0.28194050483515865, | |
| "learning_rate": 8.377973155209387e-05, | |
| "loss": 0.5553, | |
| "mean_token_accuracy": 0.8270630918443203, | |
| "num_tokens": 390294365.0, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.6113925175508753, | |
| "grad_norm": 0.27563889901609234, | |
| "learning_rate": 8.337974154300913e-05, | |
| "loss": 0.5427, | |
| "mean_token_accuracy": 0.8309814311563969, | |
| "num_tokens": 394889149.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.6185017328712343, | |
| "grad_norm": 0.27875362292884753, | |
| "learning_rate": 8.297599334944542e-05, | |
| "loss": 0.5561, | |
| "mean_token_accuracy": 0.8275676898658275, | |
| "num_tokens": 399459807.0, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.6256109481915934, | |
| "grad_norm": 0.7336148967265075, | |
| "learning_rate": 8.256854043771754e-05, | |
| "loss": 0.5507, | |
| "mean_token_accuracy": 0.8285100273787975, | |
| "num_tokens": 404034333.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.6327201635119524, | |
| "grad_norm": 0.3259646654441019, | |
| "learning_rate": 8.215743676473719e-05, | |
| "loss": 0.5503, | |
| "mean_token_accuracy": 0.8290993146598339, | |
| "num_tokens": 408627270.0, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.6398293788323114, | |
| "grad_norm": 0.3012299941832976, | |
| "learning_rate": 8.174273677086779e-05, | |
| "loss": 0.552, | |
| "mean_token_accuracy": 0.8279682919383049, | |
| "num_tokens": 413222911.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.6469385941526704, | |
| "grad_norm": 0.30771992691522176, | |
| "learning_rate": 8.132449537271519e-05, | |
| "loss": 0.552, | |
| "mean_token_accuracy": 0.8296807646751404, | |
| "num_tokens": 417806274.0, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.6540478094730294, | |
| "grad_norm": 0.2810763807856677, | |
| "learning_rate": 8.090276795585531e-05, | |
| "loss": 0.5414, | |
| "mean_token_accuracy": 0.8314659893512726, | |
| "num_tokens": 422401434.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.6611570247933884, | |
| "grad_norm": 0.2672336811508722, | |
| "learning_rate": 8.047761036749985e-05, | |
| "loss": 0.5564, | |
| "mean_token_accuracy": 0.8265900291502476, | |
| "num_tokens": 426986385.0, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.6682662401137475, | |
| "grad_norm": 0.25924906311163326, | |
| "learning_rate": 8.004907890910055e-05, | |
| "loss": 0.5452, | |
| "mean_token_accuracy": 0.8297064855694771, | |
| "num_tokens": 431585703.0, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.6753754554341065, | |
| "grad_norm": 0.2772688573388134, | |
| "learning_rate": 7.961723032889358e-05, | |
| "loss": 0.5292, | |
| "mean_token_accuracy": 0.8346129797399044, | |
| "num_tokens": 436150194.0, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.6824846707544655, | |
| "grad_norm": 0.25573353155086187, | |
| "learning_rate": 7.918212181438467e-05, | |
| "loss": 0.5397, | |
| "mean_token_accuracy": 0.8314497999846935, | |
| "num_tokens": 440736901.0, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.6895938860748245, | |
| "grad_norm": 0.2640386419783165, | |
| "learning_rate": 7.874381098477599e-05, | |
| "loss": 0.5359, | |
| "mean_token_accuracy": 0.8328767582774163, | |
| "num_tokens": 445334774.0, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.6967031013951835, | |
| "grad_norm": 0.2662269663206075, | |
| "learning_rate": 7.830235588333597e-05, | |
| "loss": 0.5578, | |
| "mean_token_accuracy": 0.8268053226172924, | |
| "num_tokens": 449908855.0, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.7038123167155426, | |
| "grad_norm": 0.2756351015892551, | |
| "learning_rate": 7.785781496971297e-05, | |
| "loss": 0.5503, | |
| "mean_token_accuracy": 0.8284729138016701, | |
| "num_tokens": 454513487.0, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.7109215320359016, | |
| "grad_norm": 0.4547105928976161, | |
| "learning_rate": 7.741024711219366e-05, | |
| "loss": 0.5431, | |
| "mean_token_accuracy": 0.8298681430518627, | |
| "num_tokens": 459106365.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.7109215320359016, | |
| "eval_loss": 0.5168540477752686, | |
| "eval_mean_token_accuracy": 0.8290872861418808, | |
| "eval_num_tokens": 459106365.0, | |
| "eval_runtime": 146.2066, | |
| "eval_samples_per_second": 24.883, | |
| "eval_steps_per_second": 0.78, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.7180307473562606, | |
| "grad_norm": 1.6021704699780053, | |
| "learning_rate": 7.695971157990754e-05, | |
| "loss": 0.5646, | |
| "mean_token_accuracy": 0.8263038910925389, | |
| "num_tokens": 463703240.0, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.7251399626766196, | |
| "grad_norm": 4.625968090811763, | |
| "learning_rate": 7.650626803497806e-05, | |
| "loss": 0.5581, | |
| "mean_token_accuracy": 0.8270722553133965, | |
| "num_tokens": 468295660.0, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.7322491779969785, | |
| "grad_norm": 0.27503115183353516, | |
| "learning_rate": 7.604997652462205e-05, | |
| "loss": 0.5492, | |
| "mean_token_accuracy": 0.8294327199459076, | |
| "num_tokens": 472896751.0, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.7393583933173375, | |
| "grad_norm": 0.267416722991217, | |
| "learning_rate": 7.55908974731978e-05, | |
| "loss": 0.5418, | |
| "mean_token_accuracy": 0.8326966613531113, | |
| "num_tokens": 477480918.0, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.7464676086376966, | |
| "grad_norm": 0.25628361203172423, | |
| "learning_rate": 7.512909167420347e-05, | |
| "loss": 0.5404, | |
| "mean_token_accuracy": 0.8324044570326805, | |
| "num_tokens": 482064392.0, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.7535768239580556, | |
| "grad_norm": 0.24597696845219366, | |
| "learning_rate": 7.466462028222654e-05, | |
| "loss": 0.5353, | |
| "mean_token_accuracy": 0.8331540204584599, | |
| "num_tokens": 486649806.0, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.7606860392784146, | |
| "grad_norm": 0.2497969231256322, | |
| "learning_rate": 7.419754480484536e-05, | |
| "loss": 0.5378, | |
| "mean_token_accuracy": 0.8323175966739654, | |
| "num_tokens": 491217398.0, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.7677952545987736, | |
| "grad_norm": 0.27136426093422567, | |
| "learning_rate": 7.3727927094484e-05, | |
| "loss": 0.5303, | |
| "mean_token_accuracy": 0.8346898458898068, | |
| "num_tokens": 495798334.0, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.7749044699191326, | |
| "grad_norm": 0.263928683082665, | |
| "learning_rate": 7.32558293402215e-05, | |
| "loss": 0.5193, | |
| "mean_token_accuracy": 0.8367893837392331, | |
| "num_tokens": 500382331.0, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.7820136852394917, | |
| "grad_norm": 0.2697485453052082, | |
| "learning_rate": 7.27813140595565e-05, | |
| "loss": 0.5249, | |
| "mean_token_accuracy": 0.836308328807354, | |
| "num_tokens": 504972961.0, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.7891229005598507, | |
| "grad_norm": 0.47577994241811294, | |
| "learning_rate": 7.23044440901283e-05, | |
| "loss": 0.5386, | |
| "mean_token_accuracy": 0.832004614919424, | |
| "num_tokens": 509556175.0, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.7962321158802097, | |
| "grad_norm": 0.26812210950339255, | |
| "learning_rate": 7.182528258139563e-05, | |
| "loss": 0.5327, | |
| "mean_token_accuracy": 0.8331871695816517, | |
| "num_tokens": 514159170.0, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.8033413312005687, | |
| "grad_norm": 0.2590503131411491, | |
| "learning_rate": 7.13438929862741e-05, | |
| "loss": 0.5447, | |
| "mean_token_accuracy": 0.8303000062704087, | |
| "num_tokens": 518758083.0, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.8104505465209277, | |
| "grad_norm": 0.2700164600845211, | |
| "learning_rate": 7.086033905273344e-05, | |
| "loss": 0.5367, | |
| "mean_token_accuracy": 0.8323484763503075, | |
| "num_tokens": 523345629.0, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.8175597618412868, | |
| "grad_norm": 0.26967028018820877, | |
| "learning_rate": 7.037468481535567e-05, | |
| "loss": 0.5212, | |
| "mean_token_accuracy": 0.8371426187455654, | |
| "num_tokens": 527940592.0, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.8246689771616458, | |
| "grad_norm": 0.3154368910279167, | |
| "learning_rate": 6.988699458685537e-05, | |
| "loss": 0.5275, | |
| "mean_token_accuracy": 0.8351783238351345, | |
| "num_tokens": 532516910.0, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.8317781924820048, | |
| "grad_norm": 0.26226153440650185, | |
| "learning_rate": 6.9397332949563e-05, | |
| "loss": 0.5335, | |
| "mean_token_accuracy": 0.8329351760447026, | |
| "num_tokens": 537121758.0, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.8388874078023638, | |
| "grad_norm": 0.31223173870328286, | |
| "learning_rate": 6.890576474687263e-05, | |
| "loss": 0.5458, | |
| "mean_token_accuracy": 0.829648780822754, | |
| "num_tokens": 541734519.0, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.8459966231227228, | |
| "grad_norm": 0.2565970150956528, | |
| "learning_rate": 6.841235507465515e-05, | |
| "loss": 0.5415, | |
| "mean_token_accuracy": 0.8324811846017838, | |
| "num_tokens": 546326546.0, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.8531058384430819, | |
| "grad_norm": 0.29462309278409743, | |
| "learning_rate": 6.791716927263778e-05, | |
| "loss": 0.5354, | |
| "mean_token_accuracy": 0.8325764186680317, | |
| "num_tokens": 550923667.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.8531058384430819, | |
| "eval_loss": 0.5030205249786377, | |
| "eval_mean_token_accuracy": 0.8328834866222582, | |
| "eval_num_tokens": 550923667.0, | |
| "eval_runtime": 145.5099, | |
| "eval_samples_per_second": 25.002, | |
| "eval_steps_per_second": 0.783, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.8602150537634409, | |
| "grad_norm": 0.2995740161508053, | |
| "learning_rate": 6.742027291575156e-05, | |
| "loss": 0.5351, | |
| "mean_token_accuracy": 0.8337548352777958, | |
| "num_tokens": 555521300.0, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.8673242690837999, | |
| "grad_norm": 0.256895454866442, | |
| "learning_rate": 6.692173180544768e-05, | |
| "loss": 0.527, | |
| "mean_token_accuracy": 0.8346491247415543, | |
| "num_tokens": 560114622.0, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.8744334844041589, | |
| "grad_norm": 0.26124663621839667, | |
| "learning_rate": 6.642161196098351e-05, | |
| "loss": 0.5299, | |
| "mean_token_accuracy": 0.835064522176981, | |
| "num_tokens": 564707120.0, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.8815426997245179, | |
| "grad_norm": 0.30629789668279445, | |
| "learning_rate": 6.591997961068024e-05, | |
| "loss": 0.5391, | |
| "mean_token_accuracy": 0.8325687229633332, | |
| "num_tokens": 569285949.0, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.888651915044877, | |
| "grad_norm": 0.2517010032545197, | |
| "learning_rate": 6.541690118315245e-05, | |
| "loss": 0.528, | |
| "mean_token_accuracy": 0.834906804561615, | |
| "num_tokens": 573871769.0, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.895761130365236, | |
| "grad_norm": 0.3714356282666368, | |
| "learning_rate": 6.491244329851133e-05, | |
| "loss": 0.521, | |
| "mean_token_accuracy": 0.8374850310385227, | |
| "num_tokens": 578461250.0, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.902870345685595, | |
| "grad_norm": 0.2513550517622928, | |
| "learning_rate": 6.440667275954262e-05, | |
| "loss": 0.5151, | |
| "mean_token_accuracy": 0.8384780243039132, | |
| "num_tokens": 583046607.0, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.909979561005954, | |
| "grad_norm": 0.2790784344252937, | |
| "learning_rate": 6.389965654286011e-05, | |
| "loss": 0.5287, | |
| "mean_token_accuracy": 0.8349935576319695, | |
| "num_tokens": 587648232.0, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.917088776326313, | |
| "grad_norm": 0.27767689120972117, | |
| "learning_rate": 6.339146179003636e-05, | |
| "loss": 0.5207, | |
| "mean_token_accuracy": 0.837136809527874, | |
| "num_tokens": 592239729.0, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.924197991646672, | |
| "grad_norm": 0.2805149976836277, | |
| "learning_rate": 6.288215579871148e-05, | |
| "loss": 0.5229, | |
| "mean_token_accuracy": 0.8374404884874821, | |
| "num_tokens": 596831306.0, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.9313072069670311, | |
| "grad_norm": 0.24703194529226574, | |
| "learning_rate": 6.23718060136812e-05, | |
| "loss": 0.5152, | |
| "mean_token_accuracy": 0.8385937295854091, | |
| "num_tokens": 601427733.0, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.9384164222873901, | |
| "grad_norm": 0.33949011504626453, | |
| "learning_rate": 6.186048001796556e-05, | |
| "loss": 0.5204, | |
| "mean_token_accuracy": 0.8384438544511795, | |
| "num_tokens": 606006466.0, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.945525637607749, | |
| "grad_norm": 0.24749318396547174, | |
| "learning_rate": 6.134824552385915e-05, | |
| "loss": 0.5256, | |
| "mean_token_accuracy": 0.8357278972864151, | |
| "num_tokens": 610597552.0, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.952634852928108, | |
| "grad_norm": 0.26267746218214755, | |
| "learning_rate": 6.0835170363964434e-05, | |
| "loss": 0.528, | |
| "mean_token_accuracy": 0.8351906433701515, | |
| "num_tokens": 615193994.0, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.959744068248467, | |
| "grad_norm": 0.25519090759528035, | |
| "learning_rate": 6.032132248220893e-05, | |
| "loss": 0.518, | |
| "mean_token_accuracy": 0.8378535941243171, | |
| "num_tokens": 619786315.0, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.966853283568826, | |
| "grad_norm": 0.25149430173186577, | |
| "learning_rate": 5.9806769924847784e-05, | |
| "loss": 0.5175, | |
| "mean_token_accuracy": 0.8372136250138282, | |
| "num_tokens": 624383919.0, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.9739624988891851, | |
| "grad_norm": 0.2669872598294479, | |
| "learning_rate": 5.929158083145271e-05, | |
| "loss": 0.5166, | |
| "mean_token_accuracy": 0.8380297608673573, | |
| "num_tokens": 628976906.0, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.9810717142095441, | |
| "grad_norm": 0.3079990980800955, | |
| "learning_rate": 5.8775823425888664e-05, | |
| "loss": 0.5171, | |
| "mean_token_accuracy": 0.8365243822336197, | |
| "num_tokens": 633557562.0, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.9881809295299031, | |
| "grad_norm": 0.26934237379344833, | |
| "learning_rate": 5.825956600727932e-05, | |
| "loss": 0.5176, | |
| "mean_token_accuracy": 0.8371751248836518, | |
| "num_tokens": 638143938.0, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.9952901448502621, | |
| "grad_norm": 0.24892879578477203, | |
| "learning_rate": 5.774287694096246e-05, | |
| "loss": 0.5203, | |
| "mean_token_accuracy": 0.8368992209434509, | |
| "num_tokens": 642760408.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.9952901448502621, | |
| "eval_loss": 0.49169814586639404, | |
| "eval_mean_token_accuracy": 0.8366760449451313, | |
| "eval_num_tokens": 642760408.0, | |
| "eval_runtime": 148.141, | |
| "eval_samples_per_second": 24.558, | |
| "eval_steps_per_second": 0.77, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.0014218430640718, | |
| "grad_norm": 0.5358904769553885, | |
| "learning_rate": 5.72258246494368e-05, | |
| "loss": 0.4893, | |
| "mean_token_accuracy": 0.8436046752376832, | |
| "num_tokens": 646718128.0, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 1.008531058384431, | |
| "grad_norm": 0.25743890956382126, | |
| "learning_rate": 5.6708477603301146e-05, | |
| "loss": 0.461, | |
| "mean_token_accuracy": 0.8506338618695736, | |
| "num_tokens": 651304404.0, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.0156402737047898, | |
| "grad_norm": 0.2648866270558085, | |
| "learning_rate": 5.6190904312187154e-05, | |
| "loss": 0.4544, | |
| "mean_token_accuracy": 0.8519260853528976, | |
| "num_tokens": 655879909.0, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 1.022749489025149, | |
| "grad_norm": 0.27694330822934976, | |
| "learning_rate": 5.567317331568687e-05, | |
| "loss": 0.4474, | |
| "mean_token_accuracy": 0.8545098066329956, | |
| "num_tokens": 660449626.0, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.0298587043455079, | |
| "grad_norm": 0.24825528169946715, | |
| "learning_rate": 5.515535317427657e-05, | |
| "loss": 0.4517, | |
| "mean_token_accuracy": 0.8533940657973289, | |
| "num_tokens": 665058163.0, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 1.0369679196658668, | |
| "grad_norm": 0.24464581183689546, | |
| "learning_rate": 5.463751246023746e-05, | |
| "loss": 0.4559, | |
| "mean_token_accuracy": 0.8523735709488391, | |
| "num_tokens": 669654595.0, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.044077134986226, | |
| "grad_norm": 0.24930171479148333, | |
| "learning_rate": 5.4119719748575106e-05, | |
| "loss": 0.4487, | |
| "mean_token_accuracy": 0.8542089037597179, | |
| "num_tokens": 674232882.0, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 1.0511863503065848, | |
| "grad_norm": 0.23303088594874635, | |
| "learning_rate": 5.360204360793836e-05, | |
| "loss": 0.4436, | |
| "mean_token_accuracy": 0.8547257304191589, | |
| "num_tokens": 678813498.0, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.058295565626944, | |
| "grad_norm": 0.317097982341769, | |
| "learning_rate": 5.308455259153915e-05, | |
| "loss": 0.458, | |
| "mean_token_accuracy": 0.8515614397823811, | |
| "num_tokens": 683401148.0, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 1.0654047809473028, | |
| "grad_norm": 0.24160258781744343, | |
| "learning_rate": 5.256731522807436e-05, | |
| "loss": 0.4506, | |
| "mean_token_accuracy": 0.8526393964886665, | |
| "num_tokens": 687982154.0, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.072513996267662, | |
| "grad_norm": 0.23602108922437653, | |
| "learning_rate": 5.205040001265094e-05, | |
| "loss": 0.4515, | |
| "mean_token_accuracy": 0.8521531477570534, | |
| "num_tokens": 692583016.0, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 1.0796232115880209, | |
| "grad_norm": 0.2431546567595459, | |
| "learning_rate": 5.1533875397715345e-05, | |
| "loss": 0.455, | |
| "mean_token_accuracy": 0.8529531605541706, | |
| "num_tokens": 697183950.0, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.08673242690838, | |
| "grad_norm": 0.27597324346348756, | |
| "learning_rate": 5.101780978398888e-05, | |
| "loss": 0.4518, | |
| "mean_token_accuracy": 0.8528432317078114, | |
| "num_tokens": 701785548.0, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 1.093841642228739, | |
| "grad_norm": 0.26932926236063864, | |
| "learning_rate": 5.050227151140958e-05, | |
| "loss": 0.4536, | |
| "mean_token_accuracy": 0.852679468691349, | |
| "num_tokens": 706364188.0, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.100950857549098, | |
| "grad_norm": 0.2587220894683173, | |
| "learning_rate": 4.998732885008244e-05, | |
| "loss": 0.4503, | |
| "mean_token_accuracy": 0.8526183031499386, | |
| "num_tokens": 710949271.0, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 1.108060072869457, | |
| "grad_norm": 0.24430696998738718, | |
| "learning_rate": 4.947304999123867e-05, | |
| "loss": 0.4357, | |
| "mean_token_accuracy": 0.8572968378663063, | |
| "num_tokens": 715539336.0, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.115169288189816, | |
| "grad_norm": 0.24614402366250857, | |
| "learning_rate": 4.895950303820552e-05, | |
| "loss": 0.4525, | |
| "mean_token_accuracy": 0.8526603005826473, | |
| "num_tokens": 720147357.0, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 1.122278503510175, | |
| "grad_norm": 0.23262198319374294, | |
| "learning_rate": 4.844675599738765e-05, | |
| "loss": 0.4523, | |
| "mean_token_accuracy": 0.852922348678112, | |
| "num_tokens": 724741149.0, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.1293877188305341, | |
| "grad_norm": 0.2551816873924689, | |
| "learning_rate": 4.793487676926142e-05, | |
| "loss": 0.4562, | |
| "mean_token_accuracy": 0.8518377915024757, | |
| "num_tokens": 729327424.0, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 1.136496934150893, | |
| "grad_norm": 0.23754167080648592, | |
| "learning_rate": 4.742393313938327e-05, | |
| "loss": 0.445, | |
| "mean_token_accuracy": 0.8547273397445678, | |
| "num_tokens": 733921218.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.136496934150893, | |
| "eval_loss": 0.4879998564720154, | |
| "eval_mean_token_accuracy": 0.8380277005203983, | |
| "eval_num_tokens": 733921218.0, | |
| "eval_runtime": 146.7948, | |
| "eval_samples_per_second": 24.783, | |
| "eval_steps_per_second": 0.777, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.1436061494712522, | |
| "grad_norm": 0.25050469601877845, | |
| "learning_rate": 4.6913992769413026e-05, | |
| "loss": 0.4552, | |
| "mean_token_accuracy": 0.8521495588123799, | |
| "num_tokens": 738503816.0, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 1.150715364791611, | |
| "grad_norm": 0.24476661787598053, | |
| "learning_rate": 4.6405123188153966e-05, | |
| "loss": 0.4506, | |
| "mean_token_accuracy": 0.8532384999096394, | |
| "num_tokens": 743095770.0, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.1578245801119702, | |
| "grad_norm": 0.24115136773182058, | |
| "learning_rate": 4.589739178261028e-05, | |
| "loss": 0.4471, | |
| "mean_token_accuracy": 0.8549422182142734, | |
| "num_tokens": 747676184.0, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 1.1649337954323291, | |
| "grad_norm": 0.24283949811905522, | |
| "learning_rate": 4.5390865789063344e-05, | |
| "loss": 0.448, | |
| "mean_token_accuracy": 0.8543575026094914, | |
| "num_tokens": 752274534.0, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.1720430107526882, | |
| "grad_norm": 0.2701107129425895, | |
| "learning_rate": 4.4885612284167955e-05, | |
| "loss": 0.4411, | |
| "mean_token_accuracy": 0.8565104402601719, | |
| "num_tokens": 756863683.0, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 1.1791522260730471, | |
| "grad_norm": 0.2886054721404824, | |
| "learning_rate": 4.4381698176069754e-05, | |
| "loss": 0.4379, | |
| "mean_token_accuracy": 0.8567862503230572, | |
| "num_tokens": 761453110.0, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.1862614413934063, | |
| "grad_norm": 0.2561982737144238, | |
| "learning_rate": 4.387919019554487e-05, | |
| "loss": 0.4532, | |
| "mean_token_accuracy": 0.8531202852725983, | |
| "num_tokens": 766041248.0, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 1.1933706567137652, | |
| "grad_norm": 0.26412588441218454, | |
| "learning_rate": 4.3378154887163144e-05, | |
| "loss": 0.4453, | |
| "mean_token_accuracy": 0.853339533507824, | |
| "num_tokens": 770624920.0, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.2004798720341243, | |
| "grad_norm": 0.25032821222177587, | |
| "learning_rate": 4.287865860047596e-05, | |
| "loss": 0.4558, | |
| "mean_token_accuracy": 0.8522251404821872, | |
| "num_tokens": 775225729.0, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 1.2075890873544832, | |
| "grad_norm": 0.23998083533004458, | |
| "learning_rate": 4.2380767481229886e-05, | |
| "loss": 0.4418, | |
| "mean_token_accuracy": 0.8569207176566124, | |
| "num_tokens": 779811918.0, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.2146983026748424, | |
| "grad_norm": 0.2456015755421057, | |
| "learning_rate": 4.1884547462607326e-05, | |
| "loss": 0.4454, | |
| "mean_token_accuracy": 0.8553664483129978, | |
| "num_tokens": 784391305.0, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 1.2218075179952013, | |
| "grad_norm": 0.25612737416807746, | |
| "learning_rate": 4.139006425649541e-05, | |
| "loss": 0.4504, | |
| "mean_token_accuracy": 0.8527485050261021, | |
| "num_tokens": 788981682.0, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.2289167333155602, | |
| "grad_norm": 0.24215144672428524, | |
| "learning_rate": 4.089738334478399e-05, | |
| "loss": 0.4466, | |
| "mean_token_accuracy": 0.8540120802819728, | |
| "num_tokens": 793548878.0, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 1.2360259486359193, | |
| "grad_norm": 0.251956160570565, | |
| "learning_rate": 4.0406569970694285e-05, | |
| "loss": 0.4514, | |
| "mean_token_accuracy": 0.8536942526698112, | |
| "num_tokens": 798145090.0, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.2431351639562784, | |
| "grad_norm": 0.24137828427946414, | |
| "learning_rate": 3.991768913013904e-05, | |
| "loss": 0.4408, | |
| "mean_token_accuracy": 0.8566184468567372, | |
| "num_tokens": 802721141.0, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 1.2502443792766373, | |
| "grad_norm": 0.3769699788745637, | |
| "learning_rate": 3.943080556311536e-05, | |
| "loss": 0.438, | |
| "mean_token_accuracy": 0.8581221453845501, | |
| "num_tokens": 807303824.0, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.2573535945969962, | |
| "grad_norm": 0.251278759950789, | |
| "learning_rate": 3.894598374513174e-05, | |
| "loss": 0.4485, | |
| "mean_token_accuracy": 0.8541063219308853, | |
| "num_tokens": 811911762.0, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 1.2644628099173554, | |
| "grad_norm": 0.24068163801342848, | |
| "learning_rate": 3.846328787866964e-05, | |
| "loss": 0.4339, | |
| "mean_token_accuracy": 0.859130322188139, | |
| "num_tokens": 816508640.0, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.2715720252377145, | |
| "grad_norm": 0.23232711368022352, | |
| "learning_rate": 3.798278188468164e-05, | |
| "loss": 0.4445, | |
| "mean_token_accuracy": 0.8543654963374138, | |
| "num_tokens": 821100737.0, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 1.2786812405580734, | |
| "grad_norm": 0.2368572559014999, | |
| "learning_rate": 3.750452939412667e-05, | |
| "loss": 0.4434, | |
| "mean_token_accuracy": 0.8547687388956546, | |
| "num_tokens": 825694727.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.2786812405580734, | |
| "eval_loss": 0.4800785183906555, | |
| "eval_mean_token_accuracy": 0.8407511988229919, | |
| "eval_num_tokens": 825694727.0, | |
| "eval_runtime": 146.4602, | |
| "eval_samples_per_second": 24.84, | |
| "eval_steps_per_second": 0.778, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.2857904558784323, | |
| "grad_norm": 0.26166517034573067, | |
| "learning_rate": 3.7028593739543715e-05, | |
| "loss": 0.4475, | |
| "mean_token_accuracy": 0.854764747619629, | |
| "num_tokens": 830291180.0, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 1.2928996711987915, | |
| "grad_norm": 0.24015937616460478, | |
| "learning_rate": 3.6555037946664926e-05, | |
| "loss": 0.4455, | |
| "mean_token_accuracy": 0.8552566647529602, | |
| "num_tokens": 834892125.0, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.3000088865191506, | |
| "grad_norm": 0.252313420976958, | |
| "learning_rate": 3.608392472606956e-05, | |
| "loss": 0.4441, | |
| "mean_token_accuracy": 0.8559129044413567, | |
| "num_tokens": 839486375.0, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 1.3071181018395095, | |
| "grad_norm": 0.256487918121681, | |
| "learning_rate": 3.5615316464879445e-05, | |
| "loss": 0.4401, | |
| "mean_token_accuracy": 0.8565216913819313, | |
| "num_tokens": 844107444.0, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.3142273171598684, | |
| "grad_norm": 0.23448215102314007, | |
| "learning_rate": 3.5149275218497445e-05, | |
| "loss": 0.4383, | |
| "mean_token_accuracy": 0.8571599997580052, | |
| "num_tokens": 848704492.0, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 1.3213365324802275, | |
| "grad_norm": 0.24419792529251788, | |
| "learning_rate": 3.4685862702389714e-05, | |
| "loss": 0.4429, | |
| "mean_token_accuracy": 0.855844734609127, | |
| "num_tokens": 853292585.0, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.3284457478005864, | |
| "grad_norm": 0.23566825561303636, | |
| "learning_rate": 3.422514028391304e-05, | |
| "loss": 0.4354, | |
| "mean_token_accuracy": 0.8570930063724518, | |
| "num_tokens": 857867604.0, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 1.3355549631209456, | |
| "grad_norm": 0.2454162982602229, | |
| "learning_rate": 3.376716897418831e-05, | |
| "loss": 0.4447, | |
| "mean_token_accuracy": 0.8552064374089241, | |
| "num_tokens": 862460961.0, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.3426641784413045, | |
| "grad_norm": 0.2524163496767361, | |
| "learning_rate": 3.331200942002113e-05, | |
| "loss": 0.4525, | |
| "mean_token_accuracy": 0.8537895001471043, | |
| "num_tokens": 867058298.0, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 1.3497733937616636, | |
| "grad_norm": 0.23190520165291026, | |
| "learning_rate": 3.2859721895870635e-05, | |
| "loss": 0.44, | |
| "mean_token_accuracy": 0.8565752863883972, | |
| "num_tokens": 871661806.0, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.3568826090820225, | |
| "grad_norm": 0.24782970977401894, | |
| "learning_rate": 3.2410366295867664e-05, | |
| "loss": 0.4352, | |
| "mean_token_accuracy": 0.8579383887350559, | |
| "num_tokens": 876250262.0, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 1.3639918244023816, | |
| "grad_norm": 0.22786025696468146, | |
| "learning_rate": 3.19640021258833e-05, | |
| "loss": 0.444, | |
| "mean_token_accuracy": 0.8550498209893703, | |
| "num_tokens": 880839029.0, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.3711010397227406, | |
| "grad_norm": 0.2265711418699179, | |
| "learning_rate": 3.152068849564879e-05, | |
| "loss": 0.4435, | |
| "mean_token_accuracy": 0.8563594095408916, | |
| "num_tokens": 885417939.0, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 1.3782102550430997, | |
| "grad_norm": 0.23977507514952898, | |
| "learning_rate": 3.1080484110927954e-05, | |
| "loss": 0.4325, | |
| "mean_token_accuracy": 0.8590381443500519, | |
| "num_tokens": 890005207.0, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.3853194703634586, | |
| "grad_norm": 0.24689756755824815, | |
| "learning_rate": 3.0643447265743096e-05, | |
| "loss": 0.44, | |
| "mean_token_accuracy": 0.85642144754529, | |
| "num_tokens": 894591297.0, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 1.3924286856838177, | |
| "grad_norm": 0.24051873631020942, | |
| "learning_rate": 3.0209635834655392e-05, | |
| "loss": 0.435, | |
| "mean_token_accuracy": 0.8576522074639797, | |
| "num_tokens": 899178832.0, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.3995379010041766, | |
| "grad_norm": 0.2413492029135495, | |
| "learning_rate": 2.9779107265100892e-05, | |
| "loss": 0.4369, | |
| "mean_token_accuracy": 0.857710150629282, | |
| "num_tokens": 903773147.0, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 1.4066471163245358, | |
| "grad_norm": 0.23506138046697497, | |
| "learning_rate": 2.9351918569783006e-05, | |
| "loss": 0.4364, | |
| "mean_token_accuracy": 0.8576699584722519, | |
| "num_tokens": 908371284.0, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.4137563316448947, | |
| "grad_norm": 0.25438867805085685, | |
| "learning_rate": 2.892812631912265e-05, | |
| "loss": 0.4349, | |
| "mean_token_accuracy": 0.8586409255862236, | |
| "num_tokens": 912978481.0, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 1.4208655469652536, | |
| "grad_norm": 0.24429497699288996, | |
| "learning_rate": 2.8507786633766877e-05, | |
| "loss": 0.4354, | |
| "mean_token_accuracy": 0.8573046490550041, | |
| "num_tokens": 917574029.0, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.4208655469652536, | |
| "eval_loss": 0.47304314374923706, | |
| "eval_mean_token_accuracy": 0.842672534156264, | |
| "eval_num_tokens": 917574029.0, | |
| "eval_runtime": 145.3562, | |
| "eval_samples_per_second": 25.028, | |
| "eval_steps_per_second": 0.784, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.4279747622856127, | |
| "grad_norm": 0.24463063083449332, | |
| "learning_rate": 2.809095517715713e-05, | |
| "loss": 0.4303, | |
| "mean_token_accuracy": 0.858917984366417, | |
| "num_tokens": 922160147.0, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 1.4350839776059718, | |
| "grad_norm": 0.24348846567727375, | |
| "learning_rate": 2.7677687148157998e-05, | |
| "loss": 0.4367, | |
| "mean_token_accuracy": 0.8577364660799504, | |
| "num_tokens": 926746028.0, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.4421931929263307, | |
| "grad_norm": 0.24745049020205356, | |
| "learning_rate": 2.7268037273747525e-05, | |
| "loss": 0.4368, | |
| "mean_token_accuracy": 0.857840034365654, | |
| "num_tokens": 931337261.0, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 1.4493024082466897, | |
| "grad_norm": 0.2439587698234042, | |
| "learning_rate": 2.686205980176998e-05, | |
| "loss": 0.4447, | |
| "mean_token_accuracy": 0.8548872321844101, | |
| "num_tokens": 935941769.0, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.4564116235670488, | |
| "grad_norm": 0.25142114078442956, | |
| "learning_rate": 2.6459808493752102e-05, | |
| "loss": 0.4284, | |
| "mean_token_accuracy": 0.8603815868496895, | |
| "num_tokens": 940535643.0, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 1.463520838887408, | |
| "grad_norm": 0.2444154895688051, | |
| "learning_rate": 2.606133661778377e-05, | |
| "loss": 0.4368, | |
| "mean_token_accuracy": 0.8575351513922215, | |
| "num_tokens": 945124519.0, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.4706300542077668, | |
| "grad_norm": 0.2397327728518288, | |
| "learning_rate": 2.5666696941463885e-05, | |
| "loss": 0.4307, | |
| "mean_token_accuracy": 0.8594269149005413, | |
| "num_tokens": 949709974.0, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 1.4777392695281257, | |
| "grad_norm": 0.3077470484547689, | |
| "learning_rate": 2.5275941724912743e-05, | |
| "loss": 0.4288, | |
| "mean_token_accuracy": 0.8588724002242089, | |
| "num_tokens": 954294899.0, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.4848484848484849, | |
| "grad_norm": 0.24584716924955974, | |
| "learning_rate": 2.4889122713851394e-05, | |
| "loss": 0.4304, | |
| "mean_token_accuracy": 0.8590269833803177, | |
| "num_tokens": 958889833.0, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 1.491957700168844, | |
| "grad_norm": 0.24260820183680837, | |
| "learning_rate": 2.4506291132749272e-05, | |
| "loss": 0.4322, | |
| "mean_token_accuracy": 0.8588926158845425, | |
| "num_tokens": 963479630.0, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.499066915489203, | |
| "grad_norm": 0.2512439219193439, | |
| "learning_rate": 2.4127497678040846e-05, | |
| "loss": 0.4338, | |
| "mean_token_accuracy": 0.8590321697294712, | |
| "num_tokens": 968086693.0, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 1.5061761308095618, | |
| "grad_norm": 0.25788120133019554, | |
| "learning_rate": 2.375279251141201e-05, | |
| "loss": 0.4302, | |
| "mean_token_accuracy": 0.8599278099834919, | |
| "num_tokens": 972668807.0, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.513285346129921, | |
| "grad_norm": 0.24857387974370135, | |
| "learning_rate": 2.338222525315758e-05, | |
| "loss": 0.4371, | |
| "mean_token_accuracy": 0.8579599760472775, | |
| "num_tokens": 977267842.0, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 1.52039456145028, | |
| "grad_norm": 0.24022880991860499, | |
| "learning_rate": 2.301584497561024e-05, | |
| "loss": 0.4234, | |
| "mean_token_accuracy": 0.862085721641779, | |
| "num_tokens": 981857003.0, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.527503776770639, | |
| "grad_norm": 0.27120541109477303, | |
| "learning_rate": 2.2653700196642134e-05, | |
| "loss": 0.4396, | |
| "mean_token_accuracy": 0.857264555990696, | |
| "num_tokens": 986456929.0, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 1.5346129920909979, | |
| "grad_norm": 0.24114703590240177, | |
| "learning_rate": 2.2295838873239965e-05, | |
| "loss": 0.4296, | |
| "mean_token_accuracy": 0.8604548752307892, | |
| "num_tokens": 991061372.0, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.541722207411357, | |
| "grad_norm": 0.23963844839444817, | |
| "learning_rate": 2.194230839515425e-05, | |
| "loss": 0.4336, | |
| "mean_token_accuracy": 0.8584208697080612, | |
| "num_tokens": 995660319.0, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 1.5488314227317161, | |
| "grad_norm": 0.24314988814533856, | |
| "learning_rate": 2.1593155578623702e-05, | |
| "loss": 0.4306, | |
| "mean_token_accuracy": 0.8601135425269604, | |
| "num_tokens": 1000236933.0, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.555940638052075, | |
| "grad_norm": 0.2566886574453899, | |
| "learning_rate": 2.1248426660175713e-05, | |
| "loss": 0.4384, | |
| "mean_token_accuracy": 0.8573588460683823, | |
| "num_tokens": 1004820862.0, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 1.563049853372434, | |
| "grad_norm": 0.2621075128506793, | |
| "learning_rate": 2.0908167290503326e-05, | |
| "loss": 0.4298, | |
| "mean_token_accuracy": 0.8607131637632847, | |
| "num_tokens": 1009411521.0, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.563049853372434, | |
| "eval_loss": 0.4672245681285858, | |
| "eval_mean_token_accuracy": 0.844007690747579, | |
| "eval_num_tokens": 1009411521.0, | |
| "eval_runtime": 146.3617, | |
| "eval_samples_per_second": 24.856, | |
| "eval_steps_per_second": 0.779, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.570159068692793, | |
| "grad_norm": 0.23570827346042514, | |
| "learning_rate": 2.0572422528420095e-05, | |
| "loss": 0.4206, | |
| "mean_token_accuracy": 0.8622309692203999, | |
| "num_tokens": 1013995376.0, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 1.577268284013152, | |
| "grad_norm": 0.28786088105829327, | |
| "learning_rate": 2.024123683489303e-05, | |
| "loss": 0.4195, | |
| "mean_token_accuracy": 0.8634026922285557, | |
| "num_tokens": 1018562407.0, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.584377499333511, | |
| "grad_norm": 0.22477409346403396, | |
| "learning_rate": 1.9914654067154996e-05, | |
| "loss": 0.4345, | |
| "mean_token_accuracy": 0.8584335811436177, | |
| "num_tokens": 1023168118.0, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 1.59148671465387, | |
| "grad_norm": 0.24599345473106599, | |
| "learning_rate": 1.959271747289686e-05, | |
| "loss": 0.4278, | |
| "mean_token_accuracy": 0.8616135574877262, | |
| "num_tokens": 1027754848.0, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.5985959299742292, | |
| "grad_norm": 0.24491593894054278, | |
| "learning_rate": 1.9275469684540404e-05, | |
| "loss": 0.4294, | |
| "mean_token_accuracy": 0.8590353332459927, | |
| "num_tokens": 1032347251.0, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 1.605705145294588, | |
| "grad_norm": 0.2540751338276317, | |
| "learning_rate": 1.8962952713592752e-05, | |
| "loss": 0.4242, | |
| "mean_token_accuracy": 0.8608104437589645, | |
| "num_tokens": 1036931829.0, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.612814360614947, | |
| "grad_norm": 0.2510287685288083, | |
| "learning_rate": 1.8655207945083e-05, | |
| "loss": 0.4239, | |
| "mean_token_accuracy": 0.8617179103195667, | |
| "num_tokens": 1041532224.0, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 1.6199235759353061, | |
| "grad_norm": 0.2693350827409704, | |
| "learning_rate": 1.8352276132081847e-05, | |
| "loss": 0.4357, | |
| "mean_token_accuracy": 0.8589904353022575, | |
| "num_tokens": 1046120676.0, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.6270327912556652, | |
| "grad_norm": 0.24443054034299724, | |
| "learning_rate": 1.8054197390304755e-05, | |
| "loss": 0.4275, | |
| "mean_token_accuracy": 0.8615889854729175, | |
| "num_tokens": 1050708153.0, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 1.6341420065760242, | |
| "grad_norm": 0.24588007040764026, | |
| "learning_rate": 1.7761011192799764e-05, | |
| "loss": 0.4238, | |
| "mean_token_accuracy": 0.8622479006648064, | |
| "num_tokens": 1055294826.0, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.641251221896383, | |
| "grad_norm": 0.24561473837992528, | |
| "learning_rate": 1.7472756364720206e-05, | |
| "loss": 0.4243, | |
| "mean_token_accuracy": 0.8616314500570297, | |
| "num_tokens": 1059896792.0, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 1.6483604372167422, | |
| "grad_norm": 0.23202476301237993, | |
| "learning_rate": 1.7189471078183302e-05, | |
| "loss": 0.4313, | |
| "mean_token_accuracy": 0.860023857653141, | |
| "num_tokens": 1064504870.0, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.6554696525371013, | |
| "grad_norm": 0.2403111932989795, | |
| "learning_rate": 1.6911192847215225e-05, | |
| "loss": 0.4315, | |
| "mean_token_accuracy": 0.85991101115942, | |
| "num_tokens": 1069092813.0, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 1.6625788678574602, | |
| "grad_norm": 0.23285052418281263, | |
| "learning_rate": 1.6637958522783298e-05, | |
| "loss": 0.4286, | |
| "mean_token_accuracy": 0.8603983536362648, | |
| "num_tokens": 1073673087.0, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.6696880831778191, | |
| "grad_norm": 0.23644436345090544, | |
| "learning_rate": 1.6369804287916028e-05, | |
| "loss": 0.4237, | |
| "mean_token_accuracy": 0.8625174552202225, | |
| "num_tokens": 1078263989.0, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 1.6767972984981783, | |
| "grad_norm": 0.2283809036559784, | |
| "learning_rate": 1.6106765652911563e-05, | |
| "loss": 0.4196, | |
| "mean_token_accuracy": 0.8629219397902489, | |
| "num_tokens": 1082858600.0, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.6839065138185374, | |
| "grad_norm": 0.2437421457507895, | |
| "learning_rate": 1.5848877450635237e-05, | |
| "loss": 0.431, | |
| "mean_token_accuracy": 0.8596989519894123, | |
| "num_tokens": 1087463215.0, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 1.6910157291388963, | |
| "grad_norm": 0.24997191755310427, | |
| "learning_rate": 1.559617383190684e-05, | |
| "loss": 0.4258, | |
| "mean_token_accuracy": 0.8600839108228684, | |
| "num_tokens": 1092046691.0, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.6981249444592552, | |
| "grad_norm": 0.24275510902589129, | |
| "learning_rate": 1.5348688260978188e-05, | |
| "loss": 0.4198, | |
| "mean_token_accuracy": 0.8634254619479179, | |
| "num_tokens": 1096635412.0, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 1.7052341597796143, | |
| "grad_norm": 0.25771028141912433, | |
| "learning_rate": 1.5106453511101657e-05, | |
| "loss": 0.4198, | |
| "mean_token_accuracy": 0.8630197443068027, | |
| "num_tokens": 1101239957.0, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.7052341597796143, | |
| "eval_loss": 0.4617161452770233, | |
| "eval_mean_token_accuracy": 0.8460459296117749, | |
| "eval_num_tokens": 1101239957.0, | |
| "eval_runtime": 143.0225, | |
| "eval_samples_per_second": 25.437, | |
| "eval_steps_per_second": 0.797, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.7123433750999735, | |
| "grad_norm": 0.2465846462175401, | |
| "learning_rate": 1.4869501660190118e-05, | |
| "loss": 0.4269, | |
| "mean_token_accuracy": 0.8613091327250004, | |
| "num_tokens": 1105835727.0, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 1.7194525904203324, | |
| "grad_norm": 0.24343231445496366, | |
| "learning_rate": 1.4637864086569114e-05, | |
| "loss": 0.4189, | |
| "mean_token_accuracy": 0.8625466778874398, | |
| "num_tokens": 1110431832.0, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.7265618057406913, | |
| "grad_norm": 0.24500024608031826, | |
| "learning_rate": 1.4411571464821522e-05, | |
| "loss": 0.4178, | |
| "mean_token_accuracy": 0.8632443450391293, | |
| "num_tokens": 1115003545.0, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 1.7336710210610504, | |
| "grad_norm": 0.24384954499049283, | |
| "learning_rate": 1.4190653761725458e-05, | |
| "loss": 0.4331, | |
| "mean_token_accuracy": 0.8595723591744899, | |
| "num_tokens": 1119594038.0, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.7407802363814096, | |
| "grad_norm": 0.24988962843301607, | |
| "learning_rate": 1.3975140232286033e-05, | |
| "loss": 0.4292, | |
| "mean_token_accuracy": 0.8610283821821213, | |
| "num_tokens": 1124191272.0, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 1.7478894517017685, | |
| "grad_norm": 0.23666630913921613, | |
| "learning_rate": 1.3765059415861142e-05, | |
| "loss": 0.4256, | |
| "mean_token_accuracy": 0.8612963631749153, | |
| "num_tokens": 1128787024.0, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.7549986670221274, | |
| "grad_norm": 0.24377997978707636, | |
| "learning_rate": 1.3560439132382218e-05, | |
| "loss": 0.4249, | |
| "mean_token_accuracy": 0.8616208277642727, | |
| "num_tokens": 1133369468.0, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 1.7621078823424865, | |
| "grad_norm": 0.24473326280197544, | |
| "learning_rate": 1.336130647867015e-05, | |
| "loss": 0.4233, | |
| "mean_token_accuracy": 0.8611096739768982, | |
| "num_tokens": 1137960753.0, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.7692170976628456, | |
| "grad_norm": 0.2814923829698822, | |
| "learning_rate": 1.3167687824846988e-05, | |
| "loss": 0.4345, | |
| "mean_token_accuracy": 0.8590093135833741, | |
| "num_tokens": 1142557989.0, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 1.7763263129832043, | |
| "grad_norm": 0.24671237642090413, | |
| "learning_rate": 1.297960881084391e-05, | |
| "loss": 0.4136, | |
| "mean_token_accuracy": 0.8641826197504997, | |
| "num_tokens": 1147139033.0, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.7834355283035634, | |
| "grad_norm": 0.23802525665842986, | |
| "learning_rate": 1.2797094343005807e-05, | |
| "loss": 0.4212, | |
| "mean_token_accuracy": 0.8627298250794411, | |
| "num_tokens": 1151728912.0, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 1.7905447436239226, | |
| "grad_norm": 0.24514167574215462, | |
| "learning_rate": 1.2620168590793105e-05, | |
| "loss": 0.4243, | |
| "mean_token_accuracy": 0.8623115479946136, | |
| "num_tokens": 1156315343.0, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.7976539589442815, | |
| "grad_norm": 0.24177052216503225, | |
| "learning_rate": 1.2448854983581134e-05, | |
| "loss": 0.4205, | |
| "mean_token_accuracy": 0.8636125177145004, | |
| "num_tokens": 1160905222.0, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 1.8047631742646404, | |
| "grad_norm": 0.25623340057701793, | |
| "learning_rate": 1.2283176207557455e-05, | |
| "loss": 0.4204, | |
| "mean_token_accuracy": 0.863289151340723, | |
| "num_tokens": 1165469584.0, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.8118723895849995, | |
| "grad_norm": 0.2366529819101992, | |
| "learning_rate": 1.2123154202717656e-05, | |
| "loss": 0.4205, | |
| "mean_token_accuracy": 0.8623673833906651, | |
| "num_tokens": 1170087058.0, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 1.8189816049053587, | |
| "grad_norm": 0.23815408906221286, | |
| "learning_rate": 1.1968810159959982e-05, | |
| "loss": 0.4167, | |
| "mean_token_accuracy": 0.8636409521102906, | |
| "num_tokens": 1174675450.0, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.8260908202257176, | |
| "grad_norm": 0.25161717096488057, | |
| "learning_rate": 1.1820164518279083e-05, | |
| "loss": 0.4308, | |
| "mean_token_accuracy": 0.8603747352957726, | |
| "num_tokens": 1179252086.0, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 1.8332000355460765, | |
| "grad_norm": 0.23828924023109987, | |
| "learning_rate": 1.1677236962059421e-05, | |
| "loss": 0.4161, | |
| "mean_token_accuracy": 0.8636845953762531, | |
| "num_tokens": 1183846581.0, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 1.8403092508664356, | |
| "grad_norm": 0.2389439298878492, | |
| "learning_rate": 1.1540046418468561e-05, | |
| "loss": 0.4093, | |
| "mean_token_accuracy": 0.8666847251355648, | |
| "num_tokens": 1188439447.0, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 1.8474184661867947, | |
| "grad_norm": 0.26036762406039, | |
| "learning_rate": 1.1408611054950722e-05, | |
| "loss": 0.4187, | |
| "mean_token_accuracy": 0.8630855195224285, | |
| "num_tokens": 1193031482.0, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.8474184661867947, | |
| "eval_loss": 0.45738622546195984, | |
| "eval_mean_token_accuracy": 0.847679163803134, | |
| "eval_num_tokens": 1193031482.0, | |
| "eval_runtime": 143.6355, | |
| "eval_samples_per_second": 25.328, | |
| "eval_steps_per_second": 0.794, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.8545276815071536, | |
| "grad_norm": 0.2419491832206913, | |
| "learning_rate": 1.1282948276820963e-05, | |
| "loss": 0.4223, | |
| "mean_token_accuracy": 0.8626484178006649, | |
| "num_tokens": 1197621510.0, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 1.8616368968275125, | |
| "grad_norm": 0.2366717377397619, | |
| "learning_rate": 1.1163074724960326e-05, | |
| "loss": 0.4202, | |
| "mean_token_accuracy": 0.8629304811358451, | |
| "num_tokens": 1202214988.0, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 1.8687461121478717, | |
| "grad_norm": 0.24750576690261594, | |
| "learning_rate": 1.10490062736121e-05, | |
| "loss": 0.4159, | |
| "mean_token_accuracy": 0.8640658937394619, | |
| "num_tokens": 1206801749.0, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 1.8758553274682308, | |
| "grad_norm": 0.2754980560042937, | |
| "learning_rate": 1.094075802827971e-05, | |
| "loss": 0.4224, | |
| "mean_token_accuracy": 0.8619605071842671, | |
| "num_tokens": 1211394066.0, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 1.8829645427885897, | |
| "grad_norm": 0.2441756409539309, | |
| "learning_rate": 1.0838344323726395e-05, | |
| "loss": 0.4159, | |
| "mean_token_accuracy": 0.8641899891197682, | |
| "num_tokens": 1215982389.0, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 1.8900737581089486, | |
| "grad_norm": 0.25017331261640485, | |
| "learning_rate": 1.0741778722076896e-05, | |
| "loss": 0.4141, | |
| "mean_token_accuracy": 0.864534319192171, | |
| "num_tokens": 1220561480.0, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 1.8971829734293078, | |
| "grad_norm": 0.24928323459761015, | |
| "learning_rate": 1.0651074011021495e-05, | |
| "loss": 0.4148, | |
| "mean_token_accuracy": 0.8647311642765999, | |
| "num_tokens": 1225151015.0, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 1.9042921887496669, | |
| "grad_norm": 0.26117744577378244, | |
| "learning_rate": 1.056624220212263e-05, | |
| "loss": 0.4227, | |
| "mean_token_accuracy": 0.8627439729869366, | |
| "num_tokens": 1229753553.0, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 1.9114014040700258, | |
| "grad_norm": 0.250926981430339, | |
| "learning_rate": 1.048729452922423e-05, | |
| "loss": 0.4118, | |
| "mean_token_accuracy": 0.8654024370014668, | |
| "num_tokens": 1234324722.0, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 1.9185106193903847, | |
| "grad_norm": 0.26445464932369295, | |
| "learning_rate": 1.0414241446964102e-05, | |
| "loss": 0.4176, | |
| "mean_token_accuracy": 0.8638374984264374, | |
| "num_tokens": 1238945254.0, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 1.9256198347107438, | |
| "grad_norm": 0.24942959940503223, | |
| "learning_rate": 1.0347092629389484e-05, | |
| "loss": 0.4098, | |
| "mean_token_accuracy": 0.8681537143886089, | |
| "num_tokens": 1243530120.0, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 1.932729050031103, | |
| "grad_norm": 0.25517475920539473, | |
| "learning_rate": 1.0285856968675917e-05, | |
| "loss": 0.4104, | |
| "mean_token_accuracy": 0.8657238759100437, | |
| "num_tokens": 1248126495.0, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 1.9398382653514619, | |
| "grad_norm": 0.24624704699692396, | |
| "learning_rate": 1.0230542573949747e-05, | |
| "loss": 0.4053, | |
| "mean_token_accuracy": 0.8677756235003471, | |
| "num_tokens": 1252728208.0, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 1.9469474806718208, | |
| "grad_norm": 0.24811417447193737, | |
| "learning_rate": 1.0181156770214243e-05, | |
| "loss": 0.4193, | |
| "mean_token_accuracy": 0.8637429274618625, | |
| "num_tokens": 1257314007.0, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 1.95405669599218, | |
| "grad_norm": 0.2553291480205661, | |
| "learning_rate": 1.013770609737961e-05, | |
| "loss": 0.4153, | |
| "mean_token_accuracy": 0.8649327427148819, | |
| "num_tokens": 1261908378.0, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 1.961165911312539, | |
| "grad_norm": 0.24846642652489853, | |
| "learning_rate": 1.010019630939691e-05, | |
| "loss": 0.4204, | |
| "mean_token_accuracy": 0.8626691080629826, | |
| "num_tokens": 1266492690.0, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 1.968275126632898, | |
| "grad_norm": 0.24853442428779762, | |
| "learning_rate": 1.0068632373496125e-05, | |
| "loss": 0.4213, | |
| "mean_token_accuracy": 0.862095658481121, | |
| "num_tokens": 1271089050.0, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 1.9753843419532569, | |
| "grad_norm": 0.25447008393745496, | |
| "learning_rate": 1.0043018469528365e-05, | |
| "loss": 0.4186, | |
| "mean_token_accuracy": 0.8638553529977798, | |
| "num_tokens": 1275693685.0, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 1.982493557273616, | |
| "grad_norm": 0.25146974784680387, | |
| "learning_rate": 1.0023357989412332e-05, | |
| "loss": 0.4132, | |
| "mean_token_accuracy": 0.8654829584062099, | |
| "num_tokens": 1280282291.0, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 1.9896027725939749, | |
| "grad_norm": 0.25186861166219776, | |
| "learning_rate": 1.000965353668517e-05, | |
| "loss": 0.4097, | |
| "mean_token_accuracy": 0.8660168826580048, | |
| "num_tokens": 1284878893.0, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.9896027725939749, | |
| "eval_loss": 0.45450538396835327, | |
| "eval_mean_token_accuracy": 0.8486974662856052, | |
| "eval_num_tokens": 1284878893.0, | |
| "eval_runtime": 143.4865, | |
| "eval_samples_per_second": 25.354, | |
| "eval_steps_per_second": 0.794, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.9967119879143338, | |
| "grad_norm": 0.2548741967506241, | |
| "learning_rate": 1.0001906926157681e-05, | |
| "loss": 0.4088, | |
| "mean_token_accuracy": 0.8670746453106404, | |
| "num_tokens": 1289465244.0, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "mean_token_accuracy": 0.8681698522052249, | |
| "num_tokens": 1291584473.0, | |
| "step": 1408, | |
| "total_flos": 9795365997903872.0, | |
| "train_loss": 0.5166227378120477, | |
| "train_runtime": 48333.5779, | |
| "train_samples_per_second": 14.899, | |
| "train_steps_per_second": 0.029 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1408, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 300, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 9795365997903872.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |