{
  "best_global_step": 43,
  "best_metric": 3.79597425,
  "best_model_checkpoint": "/workspace/output/v0-20250510-202602/checkpoint-43",
  "epoch": 0.9842632331902719,
  "eval_steps": 200,
  "global_step": 43,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.022889842632331903,
      "grad_norm": 0.5022401213645935,
      "learning_rate": 2.5e-05,
      "loss": 5.9138689041137695,
      "memory(GiB)": 22.25,
      "step": 1,
      "token_acc": 0.2735191637630662,
      "train_speed(iter/s)": 0.017618
    },
    {
      "epoch": 0.045779685264663805,
      "grad_norm": 0.4973876178264618,
      "learning_rate": 5e-05,
      "loss": 6.206646919250488,
      "memory(GiB)": 22.25,
      "step": 2,
      "token_acc": 0.25411334552102377,
      "train_speed(iter/s)": 0.024222
    },
    {
      "epoch": 0.06866952789699571,
      "grad_norm": 0.520351767539978,
      "learning_rate": 4.992664502959351e-05,
      "loss": 5.884594917297363,
      "memory(GiB)": 22.25,
      "step": 3,
      "token_acc": 0.26119402985074625,
      "train_speed(iter/s)": 0.027671
    },
    {
      "epoch": 0.09155937052932761,
      "grad_norm": 0.6917837262153625,
      "learning_rate": 4.970701059450872e-05,
      "loss": 5.813294887542725,
      "memory(GiB)": 22.25,
      "step": 4,
      "token_acc": 0.2789115646258503,
      "train_speed(iter/s)": 0.02975
    },
    {
      "epoch": 0.11444921316165951,
      "grad_norm": 0.8174898028373718,
      "learning_rate": 4.934238559694448e-05,
      "loss": 6.142425537109375,
      "memory(GiB)": 22.25,
      "step": 5,
      "token_acc": 0.20984455958549222,
      "train_speed(iter/s)": 0.031187
    },
    {
      "epoch": 0.13733905579399142,
      "grad_norm": 0.5081659555435181,
      "learning_rate": 4.8834909801373264e-05,
      "loss": 5.509262561798096,
      "memory(GiB)": 22.25,
      "step": 6,
      "token_acc": 0.29264214046822745,
      "train_speed(iter/s)": 0.032136
    },
    {
      "epoch": 0.16022889842632332,
      "grad_norm": 0.5285544395446777,
      "learning_rate": 4.8187561277552374e-05,
      "loss": 5.453015327453613,
      "memory(GiB)": 22.25,
      "step": 7,
      "token_acc": 0.33134328358208953,
      "train_speed(iter/s)": 0.032853
    },
    {
      "epoch": 0.18311874105865522,
      "grad_norm": 0.6126793026924133,
      "learning_rate": 4.740413892402639e-05,
      "loss": 5.514800071716309,
      "memory(GiB)": 22.25,
      "step": 8,
      "token_acc": 0.24347826086956523,
      "train_speed(iter/s)": 0.033481
    },
    {
      "epoch": 0.20600858369098712,
      "grad_norm": 0.5079677104949951,
      "learning_rate": 4.648924017468003e-05,
      "loss": 5.397139549255371,
      "memory(GiB)": 22.25,
      "step": 9,
      "token_acc": 0.2693069306930693,
      "train_speed(iter/s)": 0.033962
    },
    {
      "epoch": 0.22889842632331903,
      "grad_norm": 0.5848721861839294,
      "learning_rate": 4.5448234019167945e-05,
      "loss": 5.021652698516846,
      "memory(GiB)": 22.25,
      "step": 10,
      "token_acc": 0.32229965156794427,
      "train_speed(iter/s)": 0.034354
    },
    {
      "epoch": 0.25178826895565093,
      "grad_norm": 0.4369657635688782,
      "learning_rate": 4.428722949554857e-05,
      "loss": 5.207980155944824,
      "memory(GiB)": 22.25,
      "step": 11,
      "token_acc": 0.34467455621301774,
      "train_speed(iter/s)": 0.034653
    },
    {
      "epoch": 0.27467811158798283,
      "grad_norm": 0.7269682884216309,
      "learning_rate": 4.301303984001967e-05,
      "loss": 5.160121917724609,
      "memory(GiB)": 22.25,
      "step": 12,
      "token_acc": 0.34941763727121466,
      "train_speed(iter/s)": 0.034904
    },
    {
      "epoch": 0.29756795422031473,
      "grad_norm": 0.829106867313385,
      "learning_rate": 4.163314250413913e-05,
      "loss": 4.662051200866699,
      "memory(GiB)": 22.25,
      "step": 13,
      "token_acc": 0.32751091703056767,
      "train_speed(iter/s)": 0.035138
    },
    {
      "epoch": 0.32045779685264664,
      "grad_norm": 1.1529988050460815,
      "learning_rate": 4.015563527416595e-05,
      "loss": 5.173630237579346,
      "memory(GiB)": 22.25,
      "step": 14,
      "token_acc": 0.28865979381443296,
      "train_speed(iter/s)": 0.035337
    },
    {
      "epoch": 0.34334763948497854,
      "grad_norm": 0.7239392995834351,
      "learning_rate": 3.858918875003053e-05,
      "loss": 4.88520622253418,
      "memory(GiB)": 22.25,
      "step": 15,
      "token_acc": 0.332089552238806,
      "train_speed(iter/s)": 0.035497
    },
    {
      "epoch": 0.36623748211731044,
      "grad_norm": 0.5255656838417053,
      "learning_rate": 3.694299546280657e-05,
      "loss": 4.789463043212891,
      "memory(GiB)": 22.25,
      "step": 16,
      "token_acc": 0.36082474226804123,
      "train_speed(iter/s)": 0.035644
    },
    {
      "epoch": 0.38912732474964234,
      "grad_norm": 0.527284562587738,
      "learning_rate": 3.5226715929283506e-05,
      "loss": 5.008277416229248,
      "memory(GiB)": 22.25,
      "step": 17,
      "token_acc": 0.3034188034188034,
      "train_speed(iter/s)": 0.035798
    },
    {
      "epoch": 0.41201716738197425,
      "grad_norm": 0.6423527002334595,
      "learning_rate": 3.3450421960212566e-05,
      "loss": 4.778470039367676,
      "memory(GiB)": 22.25,
      "step": 18,
      "token_acc": 0.3488372093023256,
      "train_speed(iter/s)": 0.035907
    },
    {
      "epoch": 0.43490701001430615,
      "grad_norm": 0.4906652867794037,
      "learning_rate": 3.162453755491655e-05,
      "loss": 4.682660102844238,
      "memory(GiB)": 22.25,
      "step": 19,
      "token_acc": 0.35555555555555557,
      "train_speed(iter/s)": 0.036025
    },
    {
      "epoch": 0.45779685264663805,
      "grad_norm": 0.9560534358024597,
      "learning_rate": 2.975977772911671e-05,
      "loss": 4.940546989440918,
      "memory(GiB)": 22.25,
      "step": 20,
      "token_acc": 0.36923076923076925,
      "train_speed(iter/s)": 0.036093
    },
    {
      "epoch": 0.48068669527896996,
      "grad_norm": 0.5544789433479309,
      "learning_rate": 2.7867085634960016e-05,
      "loss": 4.366146087646484,
      "memory(GiB)": 22.25,
      "step": 21,
      "token_acc": 0.3649906890130354,
      "train_speed(iter/s)": 0.036168
    },
    {
      "epoch": 0.5035765379113019,
      "grad_norm": 0.4951302111148834,
      "learning_rate": 2.595756834225089e-05,
      "loss": 4.866259574890137,
      "memory(GiB)": 22.25,
      "step": 22,
      "token_acc": 0.34402852049910876,
      "train_speed(iter/s)": 0.036268
    },
    {
      "epoch": 0.5264663805436338,
      "grad_norm": 1.56654953956604,
      "learning_rate": 2.4042431657749117e-05,
      "loss": 4.790994644165039,
      "memory(GiB)": 22.25,
      "step": 23,
      "token_acc": 0.3361522198731501,
      "train_speed(iter/s)": 0.03635
    },
    {
      "epoch": 0.5493562231759657,
      "grad_norm": 0.529353678226471,
      "learning_rate": 2.2132914365039993e-05,
      "loss": 4.498373985290527,
      "memory(GiB)": 22.25,
      "step": 24,
      "token_acc": 0.38278388278388276,
      "train_speed(iter/s)": 0.036412
    },
    {
      "epoch": 0.5722460658082976,
      "grad_norm": 0.5923216342926025,
      "learning_rate": 2.0240222270883288e-05,
      "loss": 4.431886672973633,
      "memory(GiB)": 22.25,
      "step": 25,
      "token_acc": 0.3901345291479821,
      "train_speed(iter/s)": 0.036468
    },
    {
      "epoch": 0.5951359084406295,
      "grad_norm": 0.5044678449630737,
      "learning_rate": 1.8375462445083464e-05,
      "loss": 4.577709674835205,
      "memory(GiB)": 22.25,
      "step": 26,
      "token_acc": 0.3509803921568627,
      "train_speed(iter/s)": 0.036523
    },
    {
      "epoch": 0.6180257510729614,
      "grad_norm": 0.8515617251396179,
      "learning_rate": 1.6549578039787436e-05,
      "loss": 3.797635555267334,
      "memory(GiB)": 22.25,
      "step": 27,
      "token_acc": 0.40134907251264756,
      "train_speed(iter/s)": 0.036566
    },
    {
      "epoch": 0.6409155937052933,
      "grad_norm": 0.9012308120727539,
      "learning_rate": 1.4773284070716503e-05,
      "loss": 4.415590286254883,
      "memory(GiB)": 22.25,
      "step": 28,
      "token_acc": 0.38589981447124305,
      "train_speed(iter/s)": 0.036597
    },
    {
      "epoch": 0.6638054363376252,
      "grad_norm": 0.5051128268241882,
      "learning_rate": 1.3057004537193423e-05,
      "loss": 4.514218330383301,
      "memory(GiB)": 22.25,
      "step": 29,
      "token_acc": 0.3765541740674956,
      "train_speed(iter/s)": 0.036643
    },
    {
      "epoch": 0.6866952789699571,
      "grad_norm": 0.8118892908096313,
      "learning_rate": 1.1410811249969475e-05,
      "loss": 4.161840915679932,
      "memory(GiB)": 22.25,
      "step": 30,
      "token_acc": 0.35412474849094566,
      "train_speed(iter/s)": 0.036683
    },
    {
      "epoch": 0.709585121602289,
      "grad_norm": 0.7509729266166687,
      "learning_rate": 9.844364725834057e-06,
      "loss": 4.108524799346924,
      "memory(GiB)": 22.25,
      "step": 31,
      "token_acc": 0.4240924092409241,
      "train_speed(iter/s)": 0.036725
    },
    {
      "epoch": 0.7324749642346209,
      "grad_norm": 0.6745265126228333,
      "learning_rate": 8.36685749586087e-06,
      "loss": 4.507699489593506,
      "memory(GiB)": 22.25,
      "step": 32,
      "token_acc": 0.35660377358490564,
      "train_speed(iter/s)": 0.036768
    },
    {
      "epoch": 0.7553648068669528,
      "grad_norm": 0.5046018958091736,
      "learning_rate": 6.986960159980327e-06,
      "loss": 4.469419479370117,
      "memory(GiB)": 22.25,
      "step": 33,
      "token_acc": 0.41550387596899224,
      "train_speed(iter/s)": 0.036795
    },
    {
      "epoch": 0.7782546494992847,
      "grad_norm": 0.6278886198997498,
      "learning_rate": 5.712770504451426e-06,
      "loss": 4.4875640869140625,
      "memory(GiB)": 22.25,
      "step": 34,
      "token_acc": 0.386411889596603,
      "train_speed(iter/s)": 0.036831
    },
    {
      "epoch": 0.8011444921316166,
      "grad_norm": 1.2817845344543457,
      "learning_rate": 4.551765980832059e-06,
      "loss": 4.035043239593506,
      "memory(GiB)": 22.25,
      "step": 35,
      "token_acc": 0.39222042139384117,
      "train_speed(iter/s)": 0.036858
    },
    {
      "epoch": 0.8240343347639485,
      "grad_norm": 0.6294739246368408,
      "learning_rate": 3.5107598253199758e-06,
      "loss": 3.905367612838745,
      "memory(GiB)": 22.25,
      "step": 36,
      "token_acc": 0.4039301310043668,
      "train_speed(iter/s)": 0.036893
    },
    {
      "epoch": 0.8469241773962805,
      "grad_norm": 0.5308797359466553,
      "learning_rate": 2.595861075973613e-06,
      "loss": 3.832357883453369,
      "memory(GiB)": 22.25,
      "step": 37,
      "token_acc": 0.37555555555555553,
      "train_speed(iter/s)": 0.03692
    },
    {
      "epoch": 0.8698140200286123,
      "grad_norm": 0.613280177116394,
      "learning_rate": 1.8124387224476347e-06,
      "loss": 3.510023832321167,
      "memory(GiB)": 22.25,
      "step": 38,
      "token_acc": 0.41849529780564265,
      "train_speed(iter/s)": 0.036949
    },
    {
      "epoch": 0.8927038626609443,
      "grad_norm": 0.5897545218467712,
      "learning_rate": 1.1650901986267365e-06,
      "loss": 4.297924041748047,
      "memory(GiB)": 22.25,
      "step": 39,
      "token_acc": 0.38562091503267976,
      "train_speed(iter/s)": 0.036961
    },
    {
      "epoch": 0.9155937052932761,
      "grad_norm": 0.5033223032951355,
      "learning_rate": 6.576144030555259e-07,
      "loss": 3.912318229675293,
      "memory(GiB)": 22.25,
      "step": 40,
      "token_acc": 0.3920792079207921,
      "train_speed(iter/s)": 0.036991
    },
    {
      "epoch": 0.9384835479256081,
      "grad_norm": 0.44826453924179077,
      "learning_rate": 2.9298940549128964e-07,
      "loss": 3.7790920734405518,
      "memory(GiB)": 22.25,
      "step": 41,
      "token_acc": 0.4283464566929134,
      "train_speed(iter/s)": 0.03701
    },
    {
      "epoch": 0.9613733905579399,
      "grad_norm": 0.8731946349143982,
      "learning_rate": 7.335497040648898e-08,
      "loss": 4.0045576095581055,
      "memory(GiB)": 22.25,
      "step": 42,
      "token_acc": 0.3923076923076923,
      "train_speed(iter/s)": 0.03704
    },
    {
      "epoch": 0.9842632331902719,
      "grad_norm": 1.097395420074463,
      "learning_rate": 0.0,
      "loss": 4.415482521057129,
      "memory(GiB)": 22.25,
      "step": 43,
      "token_acc": 0.4146341463414634,
      "train_speed(iter/s)": 0.037059
    },
    {
      "epoch": 0.9842632331902719,
      "eval_loss": 3.7959742546081543,
      "eval_runtime": 29.3121,
      "eval_samples_per_second": 9.996,
      "eval_steps_per_second": 1.262,
      "eval_token_acc": 0.4216255442670537,
      "step": 43
    }
  ],
  "logging_steps": 1,
  "max_steps": 43,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4.042062092776243e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}