diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3433 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.22148394241417496, + "eval_steps": 500, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 73.265625, + "epoch": 0.0011074197120708748, + "grad_norm": 0.47520893812179565, + "kl": 0.0, + "learning_rate": 9.99375e-07, + "loss": 0.000854941550642252, + "reward": 2.2648561000823975, + "reward_std": 0.32521533221006393, + "rewards/GDino": 0.84943026304245, + "rewards/GIT": 0.5776679813861847, + "rewards/HPSv2": 0.2639656066894531, + "rewards/ORM": 0.5737921893596649, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -22.0, + "step": 1 + }, + { + "completion_length": 57.359375, + "epoch": 0.0022148394241417496, + "grad_norm": 0.7006784677505493, + "kl": 0.00151824951171875, + "learning_rate": 9.9875e-07, + "loss": 0.0010380030144006014, + "reward": 1.6890186071395874, + "reward_std": 0.5064275413751602, + "rewards/GDino": 0.7000000476837158, + "rewards/GIT": 0.161313958466053, + "rewards/HPSv2": 0.2509632110595703, + "rewards/ORM": 0.5767413973808289, + "self_certainty_semantic": -25.375, + "self_certainty_token": -20.5625, + "step": 2 + }, + { + "completion_length": 54.640625, + "epoch": 0.0033222591362126247, + "grad_norm": 0.5812113285064697, + "kl": 0.001556396484375, + "learning_rate": 9.98125e-07, + "loss": -0.0055133504793047905, + "reward": 1.5832943320274353, + "reward_std": 0.3882431983947754, + "rewards/GDino": 0.6165956258773804, + "rewards/GIT": 0.3970412313938141, + "rewards/HPSv2": 0.24474143981933594, + "rewards/ORM": 0.3249160535633564, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -20.8125, + "step": 3 + }, + { + "completion_length": 63.578125, + "epoch": 0.004429678848283499, + "grad_norm": 0.6130731105804443, + "kl": 0.001605987548828125, + "learning_rate": 9.975e-07, + "loss": -0.005623435601592064, + "reward": 2.1563462018966675, + "reward_std": 0.3505118489265442, + "rewards/GDino": 0.8188963234424591, + "rewards/GIT": 0.4581628292798996, + "rewards/HPSv2": 0.24955368041992188, + "rewards/ORM": 0.6297334432601929, + "self_certainty_semantic": -25.5, + "self_certainty_token": -22.0, + "step": 4 + }, + { + "completion_length": 57.65625, + "epoch": 0.005537098560354375, + "grad_norm": 0.8068524599075317, + "kl": 0.00165557861328125, + "learning_rate": 9.968749999999999e-07, + "loss": -0.0018901200965046883, + "reward": 1.6294466853141785, + "reward_std": 0.3914882242679596, + "rewards/GDino": 0.6075743436813354, + "rewards/GIT": 0.2503758817911148, + "rewards/HPSv2": 0.2523918151855469, + "rewards/ORM": 0.5191046893596649, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.8125, + "step": 5 + }, + { + "completion_length": 65.8125, + "epoch": 0.006644518272425249, + "grad_norm": 74728.3515625, + "kl": 228.00085067749023, + "learning_rate": 9.9625e-07, + "loss": 2.2879227567464113, + "reward": 2.15460866689682, + "reward_std": 0.18937285244464874, + "rewards/GDino": 0.7502027153968811, + "rewards/GIT": 0.4551280438899994, + "rewards/HPSv2": 0.2774028778076172, + "rewards/ORM": 0.671875, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -22.625, + "step": 6 + }, + { + "completion_length": 65.640625, + "epoch": 0.007751937984496124, + "grad_norm": 0.9850716590881348, + "kl": 0.001739501953125, + "learning_rate": 9.956249999999999e-07, + "loss": -0.009785129223018885, + "reward": 1.6486687660217285, + "reward_std": 0.55589759349823, + "rewards/GDino": 0.5765624940395355, + "rewards/GIT": 0.15754839032888412, + "rewards/HPSv2": 0.2522296905517578, + "rewards/ORM": 0.6623281538486481, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -22.1875, + "step": 7 + }, + { + "completion_length": 65.796875, + "epoch": 0.008859357696566999, + "grad_norm": 0.8074976801872253, + "kl": 0.001628875732421875, + "learning_rate": 9.95e-07, + "loss": 0.0002866658614948392, + "reward": 1.7531355023384094, + "reward_std": 0.3834189176559448, + "rewards/GDino": 0.7171875536441803, + "rewards/GIT": 0.3904750794172287, + "rewards/HPSv2": 0.2441272735595703, + "rewards/ORM": 0.4013456404209137, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -20.5, + "step": 8 + }, + { + "completion_length": 61.53125, + "epoch": 0.009966777408637873, + "grad_norm": 0.5135362148284912, + "kl": 0.001628875732421875, + "learning_rate": 9.94375e-07, + "loss": -0.002820038120262325, + "reward": 2.1886491775512695, + "reward_std": 0.5042529106140137, + "rewards/GDino": 0.800000011920929, + "rewards/GIT": 0.3224633187055588, + "rewards/HPSv2": 0.2661018371582031, + "rewards/ORM": 0.8000838756561279, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.5, + "step": 9 + }, + { + "completion_length": 66.9375, + "epoch": 0.01107419712070875, + "grad_norm": 1.035406231880188, + "kl": 0.001590728759765625, + "learning_rate": 9.9375e-07, + "loss": 0.010037540923804045, + "reward": 1.8388126492500305, + "reward_std": 0.385573148727417, + "rewards/GDino": 0.729426920413971, + "rewards/GIT": 0.47063055634498596, + "rewards/HPSv2": 0.25093841552734375, + "rewards/ORM": 0.3878167122602463, + "self_certainty_semantic": -25.375, + "self_certainty_token": -20.75, + "step": 10 + }, + { + "completion_length": 54.65625, + "epoch": 0.012181616832779624, + "grad_norm": 0.6659172773361206, + "kl": 0.00159454345703125, + "learning_rate": 9.93125e-07, + "loss": -0.010986692272126675, + "reward": 2.312160015106201, + "reward_std": 0.3424924612045288, + "rewards/GDino": 0.7864583432674408, + "rewards/GIT": 0.5519254580140114, + "rewards/HPSv2": 0.2634601593017578, + "rewards/ORM": 0.710316002368927, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -20.9375, + "step": 11 + }, + { + "completion_length": 65.6875, + "epoch": 0.013289036544850499, + "grad_norm": 0.4100457727909088, + "kl": 0.00152587890625, + "learning_rate": 9.925e-07, + "loss": -0.0020649502985179424, + "reward": 1.831676721572876, + "reward_std": 0.37266574054956436, + "rewards/GDino": 0.6748343408107758, + "rewards/GIT": 0.3966377377510071, + "rewards/HPSv2": 0.2431049346923828, + "rewards/ORM": 0.5170995742082596, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.5, + "step": 12 + }, + { + "completion_length": 62.15625, + "epoch": 0.014396456256921373, + "grad_norm": 1.1354421377182007, + "kl": 0.0016326904296875, + "learning_rate": 9.91875e-07, + "loss": -0.0013978920178487897, + "reward": 1.7478299736976624, + "reward_std": 0.3111024349927902, + "rewards/GDino": 0.7122170925140381, + "rewards/GIT": 0.28808362782001495, + "rewards/HPSv2": 0.2510089874267578, + "rewards/ORM": 0.4965202957391739, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.6875, + "step": 13 + }, + { + "completion_length": 63.734375, + "epoch": 0.015503875968992248, + "grad_norm": 171.63954162597656, + "kl": 11.750831604003906, + "learning_rate": 9.912499999999998e-07, + "loss": 0.11320369923487306, + "reward": 1.820958137512207, + "reward_std": 0.6430586874485016, + "rewards/GDino": 0.7286913394927979, + "rewards/GIT": 0.39159613847732544, + "rewards/HPSv2": 0.222503662109375, + "rewards/ORM": 0.47816696763038635, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.875, + "step": 14 + }, + { + "completion_length": 64.796875, + "epoch": 0.016611295681063124, + "grad_norm": 1.790418267250061, + "kl": 0.001697540283203125, + "learning_rate": 9.90625e-07, + "loss": -0.0012796747614629567, + "reward": 2.4724700450897217, + "reward_std": 0.361017182469368, + "rewards/GDino": 0.8982033133506775, + "rewards/GIT": 0.5411243438720703, + "rewards/HPSv2": 0.2581005096435547, + "rewards/ORM": 0.7750419676303864, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.8125, + "step": 15 + }, + { + "completion_length": 65.078125, + "epoch": 0.017718715393133997, + "grad_norm": 0.38361120223999023, + "kl": 0.0015869140625, + "learning_rate": 9.9e-07, + "loss": 0.006866331794299185, + "reward": 1.5055131912231445, + "reward_std": 0.40322621166706085, + "rewards/GDino": 0.651562511920929, + "rewards/GIT": 0.2843637466430664, + "rewards/HPSv2": 0.24664592742919922, + "rewards/ORM": 0.32294100522994995, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.125, + "step": 16 + }, + { + "completion_length": 70.25, + "epoch": 0.018826135105204873, + "grad_norm": 1.0185045003890991, + "kl": 0.001552581787109375, + "learning_rate": 9.89375e-07, + "loss": -0.010323233203962445, + "reward": 1.5897727608680725, + "reward_std": 0.530043363571167, + "rewards/GDino": 0.5529386103153229, + "rewards/GIT": 0.2131059616804123, + "rewards/HPSv2": 0.2552909851074219, + "rewards/ORM": 0.5684372782707214, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -20.5, + "step": 17 + }, + { + "completion_length": 66.34375, + "epoch": 0.019933554817275746, + "grad_norm": 0.4375481605529785, + "kl": 0.00156402587890625, + "learning_rate": 9.8875e-07, + "loss": -0.00136462040245533, + "reward": 2.063610315322876, + "reward_std": 0.42642320692539215, + "rewards/GDino": 0.7955474257469177, + "rewards/GIT": 0.5150393098592758, + "rewards/HPSv2": 0.22445201873779297, + "rewards/ORM": 0.528571605682373, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -20.75, + "step": 18 + }, + { + "completion_length": 59.21875, + "epoch": 0.021040974529346623, + "grad_norm": 0.3959902226924896, + "kl": 0.00164031982421875, + "learning_rate": 9.88125e-07, + "loss": -0.0053134458139538765, + "reward": 1.5237417221069336, + "reward_std": 0.4693976193666458, + "rewards/GDino": 0.701702356338501, + "rewards/GIT": 0.2579326629638672, + "rewards/HPSv2": 0.24812698364257812, + "rewards/ORM": 0.3159796893596649, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.5625, + "step": 19 + }, + { + "completion_length": 61.484375, + "epoch": 0.0221483942414175, + "grad_norm": 0.5081169605255127, + "kl": 0.001689910888671875, + "learning_rate": 9.875e-07, + "loss": 0.0003520832397043705, + "reward": 1.9516127109527588, + "reward_std": 0.2731045335531235, + "rewards/GDino": 0.6437798738479614, + "rewards/GIT": 0.4635310173034668, + "rewards/HPSv2": 0.24121475219726562, + "rewards/ORM": 0.6030870825052261, + "self_certainty_semantic": -25.1875, + "self_certainty_token": -21.5625, + "step": 20 + }, + { + "completion_length": 55.546875, + "epoch": 0.023255813953488372, + "grad_norm": 0.4565694034099579, + "kl": 0.001667022705078125, + "learning_rate": 9.86875e-07, + "loss": 0.0016932454891502857, + "reward": 2.180082321166992, + "reward_std": 0.5037369430065155, + "rewards/GDino": 0.7953125238418579, + "rewards/GIT": 0.45517681539058685, + "rewards/HPSv2": 0.2586212158203125, + "rewards/ORM": 0.6709719300270081, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.9375, + "step": 21 + }, + { + "completion_length": 68.75, + "epoch": 0.024363233665559248, + "grad_norm": 0.45827633142471313, + "kl": 0.001712799072265625, + "learning_rate": 9.862499999999999e-07, + "loss": 0.0007174527272582054, + "reward": 1.8721013069152832, + "reward_std": 0.4303991347551346, + "rewards/GDino": 0.6911458671092987, + "rewards/GIT": 0.36048486828804016, + "rewards/HPSv2": 0.2603263854980469, + "rewards/ORM": 0.5601442009210587, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -22.4375, + "step": 22 + }, + { + "completion_length": 58.609375, + "epoch": 0.02547065337763012, + "grad_norm": 0.6875389218330383, + "kl": 0.00162506103515625, + "learning_rate": 9.85625e-07, + "loss": -0.004631380317732692, + "reward": 1.9805514812469482, + "reward_std": 0.5138447731733322, + "rewards/GDino": 0.706105500459671, + "rewards/GIT": 0.4199465811252594, + "rewards/HPSv2": 0.26941490173339844, + "rewards/ORM": 0.5850843787193298, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -20.9375, + "step": 23 + }, + { + "completion_length": 60.859375, + "epoch": 0.026578073089700997, + "grad_norm": 0.5052416324615479, + "kl": 0.001667022705078125, + "learning_rate": 9.849999999999999e-07, + "loss": -0.0046843914315104485, + "reward": 2.368114173412323, + "reward_std": 0.4367552697658539, + "rewards/GDino": 0.815625011920929, + "rewards/GIT": 0.633857935667038, + "rewards/HPSv2": 0.25930213928222656, + "rewards/ORM": 0.6593290567398071, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.875, + "step": 24 + }, + { + "completion_length": 61.078125, + "epoch": 0.02768549280177187, + "grad_norm": 0.6162320971488953, + "kl": 0.001617431640625, + "learning_rate": 9.84375e-07, + "loss": -0.005464642075821757, + "reward": 1.9494624137878418, + "reward_std": 0.40468768775463104, + "rewards/GDino": 0.6967671811580658, + "rewards/GIT": 0.40975040197372437, + "rewards/HPSv2": 0.26043701171875, + "rewards/ORM": 0.5825077295303345, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.3125, + "step": 25 + }, + { + "completion_length": 50.734375, + "epoch": 0.028792912513842746, + "grad_norm": 2.8454437255859375, + "kl": 0.001804351806640625, + "learning_rate": 9.8375e-07, + "loss": -0.006305628921836615, + "reward": 2.190965175628662, + "reward_std": 0.44982025027275085, + "rewards/GDino": 0.7243013381958008, + "rewards/GIT": 0.5294483602046967, + "rewards/HPSv2": 0.2750282287597656, + "rewards/ORM": 0.6621872782707214, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -22.375, + "step": 26 + }, + { + "completion_length": 62.484375, + "epoch": 0.029900332225913623, + "grad_norm": 0.4033506512641907, + "kl": 0.0016021728515625, + "learning_rate": 9.83125e-07, + "loss": -0.0016465974040329456, + "reward": 1.9733637571334839, + "reward_std": 0.44280076026916504, + "rewards/GDino": 0.7363362908363342, + "rewards/GIT": 0.4528593420982361, + "rewards/HPSv2": 0.24550628662109375, + "rewards/ORM": 0.5386618673801422, + "self_certainty_semantic": -25.5, + "self_certainty_token": -22.375, + "step": 27 + }, + { + "completion_length": 65.046875, + "epoch": 0.031007751937984496, + "grad_norm": 0.559298574924469, + "kl": 0.00167083740234375, + "learning_rate": 9.825e-07, + "loss": 0.004501585033722222, + "reward": 1.4280173778533936, + "reward_std": 0.27060839533805847, + "rewards/GDino": 0.5987553596496582, + "rewards/GIT": 0.10973574221134186, + "rewards/HPSv2": 0.2664012908935547, + "rewards/ORM": 0.453125, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -20.9375, + "step": 28 + }, + { + "completion_length": 55.5625, + "epoch": 0.03211517165005537, + "grad_norm": 0.42233753204345703, + "kl": 0.00168609619140625, + "learning_rate": 9.81875e-07, + "loss": -0.005473613273352385, + "reward": 2.4506709575653076, + "reward_std": 0.20222720131278038, + "rewards/GDino": 0.8296874761581421, + "rewards/GIT": 0.605083167552948, + "rewards/HPSv2": 0.285858154296875, + "rewards/ORM": 0.7300421893596649, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -20.9375, + "step": 29 + }, + { + "completion_length": 57.640625, + "epoch": 0.03322259136212625, + "grad_norm": 0.5650274157524109, + "kl": 0.0016326904296875, + "learning_rate": 9.8125e-07, + "loss": 0.0003150699194520712, + "reward": 2.489137649536133, + "reward_std": 0.4210814982652664, + "rewards/GDino": 0.8948009014129639, + "rewards/GIT": 0.586266428232193, + "rewards/HPSv2": 0.24865341186523438, + "rewards/ORM": 0.7594169676303864, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.0625, + "step": 30 + }, + { + "completion_length": 78.78125, + "epoch": 0.03433001107419712, + "grad_norm": 0.6762183308601379, + "kl": 0.001613616943359375, + "learning_rate": 9.806249999999998e-07, + "loss": 0.007568572706077248, + "reward": 1.8555968403816223, + "reward_std": 0.2906922847032547, + "rewards/GDino": 0.5989583432674408, + "rewards/GIT": 0.38505683839321136, + "rewards/HPSv2": 0.2403736114501953, + "rewards/ORM": 0.6312080323696136, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -20.5625, + "step": 31 + }, + { + "completion_length": 62.5, + "epoch": 0.035437430786267994, + "grad_norm": 0.4184902012348175, + "kl": 0.001628875732421875, + "learning_rate": 9.8e-07, + "loss": 0.007896744413301349, + "reward": 1.495099127292633, + "reward_std": 0.3622882664203644, + "rewards/GDino": 0.6791666448116302, + "rewards/GIT": 0.25104063749313354, + "rewards/HPSv2": 0.23050880432128906, + "rewards/ORM": 0.3343829959630966, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -22.0625, + "step": 32 + }, + { + "completion_length": 70.109375, + "epoch": 0.036544850498338874, + "grad_norm": 0.47143352031707764, + "kl": 0.0016937255859375, + "learning_rate": 9.79375e-07, + "loss": 0.00709247519262135, + "reward": 2.3964842557907104, + "reward_std": 0.5415211468935013, + "rewards/GDino": 0.897656261920929, + "rewards/GIT": 0.6205766499042511, + "rewards/HPSv2": 0.2254810333251953, + "rewards/ORM": 0.6527703106403351, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.625, + "step": 33 + }, + { + "completion_length": 55.53125, + "epoch": 0.03765227021040975, + "grad_norm": 0.45762747526168823, + "kl": 0.001678466796875, + "learning_rate": 9.7875e-07, + "loss": 0.020488019566982985, + "reward": 1.9143174886703491, + "reward_std": 0.2841227799654007, + "rewards/GDino": 0.6593749821186066, + "rewards/GIT": 0.4214262217283249, + "rewards/HPSv2": 0.2424945831298828, + "rewards/ORM": 0.5910216569900513, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.75, + "step": 34 + }, + { + "completion_length": 55.828125, + "epoch": 0.03875968992248062, + "grad_norm": 0.3845841884613037, + "kl": 0.00167083740234375, + "learning_rate": 9.78125e-07, + "loss": 0.01862273830920458, + "reward": 2.274049997329712, + "reward_std": 0.28603486716747284, + "rewards/GDino": 0.7786458432674408, + "rewards/GIT": 0.5405041128396988, + "rewards/HPSv2": 0.23740386962890625, + "rewards/ORM": 0.7174962311983109, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.25, + "step": 35 + }, + { + "completion_length": 63.234375, + "epoch": 0.03986710963455149, + "grad_norm": 0.5729533433914185, + "kl": 0.001678466796875, + "learning_rate": 9.775e-07, + "loss": -0.002963901497423649, + "reward": 1.8639960289001465, + "reward_std": 0.3890039473772049, + "rewards/GDino": 0.6255208253860474, + "rewards/GIT": 0.42713797092437744, + "rewards/HPSv2": 0.24535751342773438, + "rewards/ORM": 0.5659796744585037, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.3125, + "step": 36 + }, + { + "completion_length": 63.09375, + "epoch": 0.04097452934662237, + "grad_norm": 0.47338196635246277, + "kl": 0.001888275146484375, + "learning_rate": 9.76875e-07, + "loss": 0.008916446007788181, + "reward": 1.9735829830169678, + "reward_std": 0.5416238605976105, + "rewards/GDino": 0.7008762061595917, + "rewards/GIT": 0.3141380175948143, + "rewards/HPSv2": 0.2595968246459961, + "rewards/ORM": 0.6989719867706299, + "self_certainty_semantic": -25.375, + "self_certainty_token": -23.125, + "step": 37 + }, + { + "completion_length": 58.640625, + "epoch": 0.042081949058693245, + "grad_norm": 1.639336347579956, + "kl": 0.001651763916015625, + "learning_rate": 9.7625e-07, + "loss": -0.0003745388239622116, + "reward": 1.8843677639961243, + "reward_std": 0.27646802365779877, + "rewards/GDino": 0.7309310734272003, + "rewards/GIT": 0.2879854440689087, + "rewards/HPSv2": 0.25732994079589844, + "rewards/ORM": 0.6081212311983109, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.0625, + "step": 38 + }, + { + "completion_length": 54.453125, + "epoch": 0.04318936877076412, + "grad_norm": 0.4438176453113556, + "kl": 0.00176239013671875, + "learning_rate": 9.756249999999999e-07, + "loss": -0.004410726949572563, + "reward": 2.3740460872650146, + "reward_std": 0.26216618716716766, + "rewards/GDino": 0.8794216811656952, + "rewards/GIT": 0.480433389544487, + "rewards/HPSv2": 0.2703990936279297, + "rewards/ORM": 0.7437919676303864, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.0, + "step": 39 + }, + { + "completion_length": 64.65625, + "epoch": 0.044296788482835, + "grad_norm": 0.9789016246795654, + "kl": 0.0017242431640625, + "learning_rate": 9.75e-07, + "loss": -0.0008055282523855567, + "reward": 2.2535433769226074, + "reward_std": 0.46909773349761963, + "rewards/GDino": 0.8751652538776398, + "rewards/GIT": 0.4070926010608673, + "rewards/HPSv2": 0.2731647491455078, + "rewards/ORM": 0.6981207877397537, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -20.625, + "step": 40 + }, + { + "completion_length": 60.3125, + "epoch": 0.04540420819490587, + "grad_norm": 0.39339736104011536, + "kl": 0.001697540283203125, + "learning_rate": 9.743749999999999e-07, + "loss": -0.0026839073980227113, + "reward": 1.926289677619934, + "reward_std": 0.21494604647159576, + "rewards/GDino": 0.6536072194576263, + "rewards/GIT": 0.38067150115966797, + "rewards/HPSv2": 0.2470531463623047, + "rewards/ORM": 0.6449578106403351, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.0, + "step": 41 + }, + { + "completion_length": 55.734375, + "epoch": 0.046511627906976744, + "grad_norm": 0.43325623869895935, + "kl": 0.001575469970703125, + "learning_rate": 9.7375e-07, + "loss": 0.01566000678576529, + "reward": 2.2492642402648926, + "reward_std": 0.545527771115303, + "rewards/GDino": 0.8451037406921387, + "rewards/GIT": 0.4486817270517349, + "rewards/HPSv2": 0.2523536682128906, + "rewards/ORM": 0.703125, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.125, + "step": 42 + }, + { + "completion_length": 77.859375, + "epoch": 0.047619047619047616, + "grad_norm": 0.6008194088935852, + "kl": 0.00209808349609375, + "learning_rate": 9.73125e-07, + "loss": 0.009053934598341584, + "reward": 1.752554178237915, + "reward_std": 0.3711804449558258, + "rewards/GDino": 0.6425288617610931, + "rewards/GIT": 0.38656318187713623, + "rewards/HPSv2": 0.23595809936523438, + "rewards/ORM": 0.4875040054321289, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.8125, + "step": 43 + }, + { + "completion_length": 64.859375, + "epoch": 0.048726467331118496, + "grad_norm": 0.4626310169696808, + "kl": 0.001750946044921875, + "learning_rate": 9.725e-07, + "loss": 0.00038470514118671417, + "reward": 2.837794542312622, + "reward_std": 0.3451881557703018, + "rewards/GDino": 0.9479166865348816, + "rewards/GIT": 0.7795328795909882, + "rewards/HPSv2": 0.26932334899902344, + "rewards/ORM": 0.8410216569900513, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.5625, + "step": 44 + }, + { + "completion_length": 66.921875, + "epoch": 0.04983388704318937, + "grad_norm": 1.3941670656204224, + "kl": 0.001880645751953125, + "learning_rate": 9.71875e-07, + "loss": -0.012070931028574705, + "reward": 2.561403751373291, + "reward_std": 0.48213036358356476, + "rewards/GDino": 0.9039532244205475, + "rewards/GIT": 0.5467919409275055, + "rewards/HPSv2": 0.2617225646972656, + "rewards/ORM": 0.8489359319210052, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.9375, + "step": 45 + }, + { + "completion_length": 59.625, + "epoch": 0.05094130675526024, + "grad_norm": 0.5365378260612488, + "kl": 0.001949310302734375, + "learning_rate": 9.712499999999998e-07, + "loss": 0.01103684725239873, + "reward": 2.0622146129608154, + "reward_std": 0.40072987973690033, + "rewards/GDino": 0.645312488079071, + "rewards/GIT": 0.33725525438785553, + "rewards/HPSv2": 0.2619609832763672, + "rewards/ORM": 0.8176859617233276, + "self_certainty_semantic": -25.25, + "self_certainty_token": -22.3125, + "step": 46 + }, + { + "completion_length": 64.6875, + "epoch": 0.05204872646733112, + "grad_norm": 0.5151812434196472, + "kl": 0.001766204833984375, + "learning_rate": 9.70625e-07, + "loss": -0.004148014355450869, + "reward": 1.7916635870933533, + "reward_std": 0.31147970259189606, + "rewards/GDino": 0.7293796539306641, + "rewards/GIT": 0.20818163454532623, + "rewards/HPSv2": 0.27945709228515625, + "rewards/ORM": 0.5746453106403351, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.5625, + "step": 47 + }, + { + "completion_length": 56.25, + "epoch": 0.053156146179401995, + "grad_norm": 0.7559373378753662, + "kl": 0.001861572265625, + "learning_rate": 9.7e-07, + "loss": -0.002030523493885994, + "reward": 1.4302473068237305, + "reward_std": 0.4484506845474243, + "rewards/GDino": 0.6244329512119293, + "rewards/GIT": 0.0, + "rewards/HPSv2": 0.2752876281738281, + "rewards/ORM": 0.5305267572402954, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -20.875, + "step": 48 + }, + { + "completion_length": 61.21875, + "epoch": 0.05426356589147287, + "grad_norm": 0.46310731768608093, + "kl": 0.00177764892578125, + "learning_rate": 9.69375e-07, + "loss": 0.0054672048427164555, + "reward": 1.9361683130264282, + "reward_std": 0.3801421523094177, + "rewards/GDino": 0.7904821038246155, + "rewards/GIT": 0.2458050437271595, + "rewards/HPSv2": 0.25890541076660156, + "rewards/ORM": 0.640975683927536, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.6875, + "step": 49 + }, + { + "completion_length": 61.921875, + "epoch": 0.05537098560354374, + "grad_norm": 0.5111473798751831, + "kl": 0.002353668212890625, + "learning_rate": 9.6875e-07, + "loss": 0.0035089042503386736, + "reward": 2.212684750556946, + "reward_std": 0.3874351307749748, + "rewards/GDino": 0.7840971350669861, + "rewards/GIT": 0.42198260873556137, + "rewards/HPSv2": 0.25807952880859375, + "rewards/ORM": 0.7485254108905792, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -22.625, + "step": 50 + }, + { + "completion_length": 57.796875, + "epoch": 0.05647840531561462, + "grad_norm": 0.4804292917251587, + "kl": 0.001743316650390625, + "learning_rate": 9.68125e-07, + "loss": -0.0010273723164573312, + "reward": 1.8951371908187866, + "reward_std": 0.5679852366447449, + "rewards/GDino": 0.7922006845474243, + "rewards/GIT": 0.27185457944869995, + "rewards/HPSv2": 0.2777671813964844, + "rewards/ORM": 0.5533146858215332, + "self_certainty_semantic": -25.625, + "self_certainty_token": -22.0, + "step": 51 + }, + { + "completion_length": 62.140625, + "epoch": 0.05758582502768549, + "grad_norm": 0.5876587629318237, + "kl": 0.001842498779296875, + "learning_rate": 9.675e-07, + "loss": 0.010319232940673828, + "reward": 2.453005313873291, + "reward_std": 0.35728050768375397, + "rewards/GDino": 0.917187511920929, + "rewards/GIT": 0.6651300191879272, + "rewards/HPSv2": 0.27350807189941406, + "rewards/ORM": 0.5971797704696655, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.0625, + "step": 52 + }, + { + "completion_length": 57.046875, + "epoch": 0.058693244739756366, + "grad_norm": 0.5244357585906982, + "kl": 0.00168609619140625, + "learning_rate": 9.66875e-07, + "loss": 0.0012504801852628589, + "reward": 1.8911731839179993, + "reward_std": 0.3232653737068176, + "rewards/GDino": 0.7297230660915375, + "rewards/GIT": 0.3948078155517578, + "rewards/HPSv2": 0.24039649963378906, + "rewards/ORM": 0.5262457728385925, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.25, + "step": 53 + }, + { + "completion_length": 68.921875, + "epoch": 0.059800664451827246, + "grad_norm": 0.5011692047119141, + "kl": 0.0017547607421875, + "learning_rate": 9.6625e-07, + "loss": -0.001990929711610079, + "reward": 1.5346381068229675, + "reward_std": 0.5364750325679779, + "rewards/GDino": 0.5896078050136566, + "rewards/GIT": 0.2611962556838989, + "rewards/HPSv2": 0.24633407592773438, + "rewards/ORM": 0.4375000149011612, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.5625, + "step": 54 + }, + { + "completion_length": 65.28125, + "epoch": 0.06090808416389812, + "grad_norm": 0.43720903992652893, + "kl": 0.001796722412109375, + "learning_rate": 9.65625e-07, + "loss": 0.011945425532758236, + "reward": 1.7657405734062195, + "reward_std": 0.5052186846733093, + "rewards/GDino": 0.7055748403072357, + "rewards/GIT": 0.3213713690638542, + "rewards/HPSv2": 0.26223182678222656, + "rewards/ORM": 0.4765625, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.8125, + "step": 55 + }, + { + "completion_length": 72.15625, + "epoch": 0.06201550387596899, + "grad_norm": 0.6576823592185974, + "kl": 0.00201416015625, + "learning_rate": 9.649999999999999e-07, + "loss": 0.010990551207214594, + "reward": 2.0798487663269043, + "reward_std": 0.5881477892398834, + "rewards/GDino": 0.7611979246139526, + "rewards/GIT": 0.38940075039863586, + "rewards/HPSv2": 0.25081634521484375, + "rewards/ORM": 0.678433746099472, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.8125, + "step": 56 + }, + { + "completion_length": 53.84375, + "epoch": 0.06312292358803986, + "grad_norm": 0.5109694600105286, + "kl": 0.001708984375, + "learning_rate": 9.64375e-07, + "loss": -0.009197955019772053, + "reward": 1.825343132019043, + "reward_std": 0.49610868096351624, + "rewards/GDino": 0.7342002689838409, + "rewards/GIT": 0.27930086851119995, + "rewards/HPSv2": 0.2493419647216797, + "rewards/ORM": 0.5625, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.5, + "step": 57 + }, + { + "completion_length": 54.671875, + "epoch": 0.06423034330011074, + "grad_norm": 0.48297855257987976, + "kl": 0.0018157958984375, + "learning_rate": 9.637499999999999e-07, + "loss": -2.7031637728214264e-05, + "reward": 1.9436655044555664, + "reward_std": 0.5841460824012756, + "rewards/GDino": 0.7508301734924316, + "rewards/GIT": 0.36742376536130905, + "rewards/HPSv2": 0.24603271484375, + "rewards/ORM": 0.579378753900528, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.6875, + "step": 58 + }, + { + "completion_length": 57.34375, + "epoch": 0.06533776301218161, + "grad_norm": 1.5652471780776978, + "kl": 0.00185394287109375, + "learning_rate": 9.63125e-07, + "loss": -0.0014887296129018068, + "reward": 2.154895305633545, + "reward_std": 0.5548917800188065, + "rewards/GDino": 0.7907229363918304, + "rewards/GIT": 0.44339829683303833, + "rewards/HPSv2": 0.2567615509033203, + "rewards/ORM": 0.664012536406517, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.0625, + "step": 59 + }, + { + "completion_length": 52.0625, + "epoch": 0.0664451827242525, + "grad_norm": 0.8647972941398621, + "kl": 0.00200653076171875, + "learning_rate": 9.624999999999999e-07, + "loss": -0.004864218062721193, + "reward": 2.183086931705475, + "reward_std": 0.27265597879886627, + "rewards/GDino": 0.8968750238418579, + "rewards/GIT": 0.4909053146839142, + "rewards/HPSv2": 0.2511100769042969, + "rewards/ORM": 0.544196605682373, + "self_certainty_semantic": -25.25, + "self_certainty_token": -20.8125, + "step": 60 + }, + { + "completion_length": 78.421875, + "epoch": 0.06755260243632337, + "grad_norm": 0.6149311065673828, + "kl": 0.0018310546875, + "learning_rate": 9.61875e-07, + "loss": -0.003399772336706519, + "reward": 2.3938775062561035, + "reward_std": 0.3266971558332443, + "rewards/GDino": 0.7299478650093079, + "rewards/GIT": 0.6572037935256958, + "rewards/HPSv2": 0.26293373107910156, + "rewards/ORM": 0.743791937828064, + "self_certainty_semantic": -25.5, + "self_certainty_token": -20.5, + "step": 61 + }, + { + "completion_length": 71.796875, + "epoch": 0.06866002214839424, + "grad_norm": 0.8106938600540161, + "kl": 0.00188446044921875, + "learning_rate": 9.6125e-07, + "loss": -0.004746791877551004, + "reward": 2.3078866004943848, + "reward_std": 0.4594850391149521, + "rewards/GDino": 0.7886728346347809, + "rewards/GIT": 0.6039779186248779, + "rewards/HPSv2": 0.2555561065673828, + "rewards/ORM": 0.6596797406673431, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.125, + "step": 62 + }, + { + "completion_length": 57.703125, + "epoch": 0.06976744186046512, + "grad_norm": 0.5699672102928162, + "kl": 0.00218963623046875, + "learning_rate": 9.606249999999998e-07, + "loss": 0.005022911122068763, + "reward": 2.2111340165138245, + "reward_std": 0.6219878196716309, + "rewards/GDino": 0.794545441865921, + "rewards/GIT": 0.45049863308668137, + "rewards/HPSv2": 0.24386024475097656, + "rewards/ORM": 0.7222297191619873, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.9375, + "step": 63 + }, + { + "completion_length": 78.453125, + "epoch": 0.07087486157253599, + "grad_norm": 0.7573527693748474, + "kl": 0.0022125244140625, + "learning_rate": 9.6e-07, + "loss": 0.013895762618631124, + "reward": 1.6789215207099915, + "reward_std": 0.15597553551197052, + "rewards/GDino": 0.7209441661834717, + "rewards/GIT": 0.31718890368938446, + "rewards/HPSv2": 0.26105499267578125, + "rewards/ORM": 0.37973345816135406, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.5625, + "step": 64 + }, + { + "completion_length": 63.59375, + "epoch": 0.07198228128460686, + "grad_norm": 0.4424923360347748, + "kl": 0.0020599365234375, + "learning_rate": 9.59375e-07, + "loss": 0.0005846736021339893, + "reward": 2.195925712585449, + "reward_std": 0.5788445174694061, + "rewards/GDino": 0.7169270515441895, + "rewards/GIT": 0.6367218196392059, + "rewards/HPSv2": 0.2345561981201172, + "rewards/ORM": 0.6077205836772919, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.75, + "step": 65 + }, + { + "completion_length": 67.6875, + "epoch": 0.07308970099667775, + "grad_norm": 0.5050013661384583, + "kl": 0.00211334228515625, + "learning_rate": 9.5875e-07, + "loss": 0.010172993643209338, + "reward": 2.220258355140686, + "reward_std": 0.30588236451148987, + "rewards/GDino": 0.7442708909511566, + "rewards/GIT": 0.47482602298259735, + "rewards/HPSv2": 0.25937461853027344, + "rewards/ORM": 0.7417868673801422, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.5, + "step": 66 + }, + { + "completion_length": 72.75, + "epoch": 0.07419712070874862, + "grad_norm": 0.47647950053215027, + "kl": 0.001953125, + "learning_rate": 9.58125e-07, + "loss": 0.002580178901553154, + "reward": 2.3537763357162476, + "reward_std": 0.2857324182987213, + "rewards/GDino": 0.852263331413269, + "rewards/GIT": 0.5637244433164597, + "rewards/HPSv2": 0.2550220489501953, + "rewards/ORM": 0.6827665567398071, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.4375, + "step": 67 + }, + { + "completion_length": 60.109375, + "epoch": 0.0753045404208195, + "grad_norm": 0.45224544405937195, + "kl": 0.0021209716796875, + "learning_rate": 9.575e-07, + "loss": 0.002825574716553092, + "reward": 1.613221287727356, + "reward_std": 0.332104429602623, + "rewards/GDino": 0.6193348169326782, + "rewards/GIT": 0.2909398823976517, + "rewards/HPSv2": 0.2551765441894531, + "rewards/ORM": 0.4477700889110565, + "self_certainty_semantic": -25.25, + "self_certainty_token": -21.0625, + "step": 68 + }, + { + "completion_length": 72.6875, + "epoch": 0.07641196013289037, + "grad_norm": 0.688894510269165, + "kl": 0.002315521240234375, + "learning_rate": 9.56875e-07, + "loss": 0.012800770811736584, + "reward": 2.1092969179153442, + "reward_std": 0.36874186992645264, + "rewards/GDino": 0.8054687678813934, + "rewards/GIT": 0.3866874873638153, + "rewards/HPSv2": 0.26236534118652344, + "rewards/ORM": 0.6547753810882568, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.5, + "step": 69 + }, + { + "completion_length": 60.640625, + "epoch": 0.07751937984496124, + "grad_norm": 0.45330390334129333, + "kl": 0.00215911865234375, + "learning_rate": 9.5625e-07, + "loss": -0.0010713667143136263, + "reward": 1.552397072315216, + "reward_std": 0.39455118775367737, + "rewards/GDino": 0.6554375886917114, + "rewards/GIT": 0.22663478553295135, + "rewards/HPSv2": 0.2546577453613281, + "rewards/ORM": 0.41566696763038635, + "self_certainty_semantic": -25.25, + "self_certainty_token": -20.75, + "step": 70 + }, + { + "completion_length": 76.515625, + "epoch": 0.07862679955703211, + "grad_norm": 0.5808414220809937, + "kl": 0.00222015380859375, + "learning_rate": 9.556249999999999e-07, + "loss": 0.0038980550598353148, + "reward": 1.9476300477981567, + "reward_std": 0.38603267073631287, + "rewards/GDino": 0.7262610197067261, + "rewards/GIT": 0.30087296664714813, + "rewards/HPSv2": 0.26424598693847656, + "rewards/ORM": 0.6562500149011612, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -19.9375, + "step": 71 + }, + { + "completion_length": 57.15625, + "epoch": 0.07973421926910298, + "grad_norm": 0.3693688213825226, + "kl": 0.00208282470703125, + "learning_rate": 9.55e-07, + "loss": -0.00035159417893737555, + "reward": 1.9391373991966248, + "reward_std": 0.3963821530342102, + "rewards/GDino": 0.6879567801952362, + "rewards/GIT": 0.4622843265533447, + "rewards/HPSv2": 0.24675464630126953, + "rewards/ORM": 0.5421415567398071, + "self_certainty_semantic": -25.0625, + "self_certainty_token": -20.9375, + "step": 72 + }, + { + "completion_length": 66.65625, + "epoch": 0.08084163898117387, + "grad_norm": 0.6215986013412476, + "kl": 0.0024871826171875, + "learning_rate": 9.543749999999999e-07, + "loss": 0.003838272183202207, + "reward": 2.1008963584899902, + "reward_std": 0.4600249230861664, + "rewards/GDino": 0.8240202069282532, + "rewards/GIT": 0.48449917137622833, + "rewards/HPSv2": 0.24818038940429688, + "rewards/ORM": 0.5441965609788895, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.5, + "step": 73 + }, + { + "completion_length": 60.859375, + "epoch": 0.08194905869324474, + "grad_norm": 0.43593713641166687, + "kl": 0.0030364990234375, + "learning_rate": 9.5375e-07, + "loss": 0.002844013855792582, + "reward": 2.297879934310913, + "reward_std": 0.2846696451306343, + "rewards/GDino": 0.84375, + "rewards/GIT": 0.5265894532203674, + "rewards/HPSv2": 0.2544116973876953, + "rewards/ORM": 0.6731287837028503, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.75, + "step": 74 + }, + { + "completion_length": 68.703125, + "epoch": 0.08305647840531562, + "grad_norm": 0.48668116331100464, + "kl": 0.002227783203125, + "learning_rate": 9.53125e-07, + "loss": -0.0021062323357909918, + "reward": 1.7519539594650269, + "reward_std": 0.3109753131866455, + "rewards/GDino": 0.6498888432979584, + "rewards/GIT": 0.2745012864470482, + "rewards/HPSv2": 0.26706886291503906, + "rewards/ORM": 0.5604948848485947, + "self_certainty_semantic": -25.5, + "self_certainty_token": -20.625, + "step": 75 + }, + { + "completion_length": 70.25, + "epoch": 0.08416389811738649, + "grad_norm": 0.5122522711753845, + "kl": 0.00208282470703125, + "learning_rate": 9.525e-07, + "loss": -0.00045439647510647774, + "reward": 2.371267318725586, + "reward_std": 0.4085633456707001, + "rewards/GDino": 0.8135416805744171, + "rewards/GIT": 0.6540948301553726, + "rewards/HPSv2": 0.2650108337402344, + "rewards/ORM": 0.6386198997497559, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -20.75, + "step": 76 + }, + { + "completion_length": 62.875, + "epoch": 0.08527131782945736, + "grad_norm": 0.505736768245697, + "kl": 0.0037689208984375, + "learning_rate": 9.51875e-07, + "loss": -0.006699402409140021, + "reward": 1.5121636986732483, + "reward_std": 0.5349836349487305, + "rewards/GDino": 0.616510659456253, + "rewards/GIT": 0.18113864213228226, + "rewards/HPSv2": 0.228485107421875, + "rewards/ORM": 0.48602940142154694, + "self_certainty_semantic": -25.125, + "self_certainty_token": -21.875, + "step": 77 + }, + { + "completion_length": 65.8125, + "epoch": 0.08637873754152824, + "grad_norm": 0.4759610593318939, + "kl": 0.0022735595703125, + "learning_rate": 9.5125e-07, + "loss": 0.0014968996401876211, + "reward": 1.9482250213623047, + "reward_std": 0.38150524348020554, + "rewards/GDino": 0.7646995186805725, + "rewards/GIT": 0.31973105669021606, + "rewards/HPSv2": 0.2705249786376953, + "rewards/ORM": 0.5932694524526596, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.125, + "step": 78 + }, + { + "completion_length": 72.609375, + "epoch": 0.08748615725359911, + "grad_norm": 0.4961722195148468, + "kl": 0.00247955322265625, + "learning_rate": 9.50625e-07, + "loss": 0.00820195721462369, + "reward": 2.2431598901748657, + "reward_std": 0.19805177673697472, + "rewards/GDino": 0.8183182775974274, + "rewards/GIT": 0.60882468521595, + "rewards/HPSv2": 0.2628040313720703, + "rewards/ORM": 0.5532128810882568, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.5625, + "step": 79 + }, + { + "completion_length": 66.0625, + "epoch": 0.08859357696567, + "grad_norm": 0.5290701389312744, + "kl": 0.00308990478515625, + "learning_rate": 9.499999999999999e-07, + "loss": -0.001018086913973093, + "reward": 1.7054139375686646, + "reward_std": 0.4478110671043396, + "rewards/GDino": 0.6419965624809265, + "rewards/GIT": 0.19029075652360916, + "rewards/HPSv2": 0.2727680206298828, + "rewards/ORM": 0.6003586649894714, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.5, + "step": 80 + }, + { + "completion_length": 69.75, + "epoch": 0.08970099667774087, + "grad_norm": 0.530961811542511, + "kl": 0.00331878662109375, + "learning_rate": 9.493749999999999e-07, + "loss": -0.0018104221671819687, + "reward": 2.1294270157814026, + "reward_std": 0.30140096694231033, + "rewards/GDino": 0.7601194977760315, + "rewards/GIT": 0.36138176918029785, + "rewards/HPSv2": 0.27007102966308594, + "rewards/ORM": 0.7378547042608261, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -20.25, + "step": 81 + }, + { + "completion_length": 62.25, + "epoch": 0.09080841638981174, + "grad_norm": 0.5380280017852783, + "kl": 0.0029449462890625, + "learning_rate": 9.487499999999999e-07, + "loss": 0.0027263425290584564, + "reward": 1.7531540989875793, + "reward_std": 0.40144187211990356, + "rewards/GDino": 0.6388830840587616, + "rewards/GIT": 0.3787819594144821, + "rewards/HPSv2": 0.26526451110839844, + "rewards/ORM": 0.4702245742082596, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.3125, + "step": 82 + }, + { + "completion_length": 57.125, + "epoch": 0.09191583610188261, + "grad_norm": 0.46656447649002075, + "kl": 0.00229644775390625, + "learning_rate": 9.481249999999999e-07, + "loss": 0.0034079640172421932, + "reward": 2.1076533794403076, + "reward_std": 0.3496774584054947, + "rewards/GDino": 0.8086712956428528, + "rewards/GIT": 0.44665491580963135, + "rewards/HPSv2": 0.2527198791503906, + "rewards/ORM": 0.5996073186397552, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.875, + "step": 83 + }, + { + "completion_length": 77.609375, + "epoch": 0.09302325581395349, + "grad_norm": 0.7098491787910461, + "kl": 0.003326416015625, + "learning_rate": 9.474999999999999e-07, + "loss": -0.015582434833049774, + "reward": 2.0792417526245117, + "reward_std": 0.405472531914711, + "rewards/GDino": 0.8217203617095947, + "rewards/GIT": 0.6337592005729675, + "rewards/HPSv2": 0.2409496307373047, + "rewards/ORM": 0.3828125, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.25, + "step": 84 + }, + { + "completion_length": 70.0, + "epoch": 0.09413067552602436, + "grad_norm": 0.453952431678772, + "kl": 0.0030059814453125, + "learning_rate": 9.468749999999999e-07, + "loss": -0.008341801585629582, + "reward": 1.7731398940086365, + "reward_std": 0.43146421015262604, + "rewards/GDino": 0.6217962503433228, + "rewards/GIT": 0.33136892318725586, + "rewards/HPSv2": 0.2414989471435547, + "rewards/ORM": 0.5784757435321808, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.8125, + "step": 85 + }, + { + "completion_length": 55.46875, + "epoch": 0.09523809523809523, + "grad_norm": 0.6065813302993774, + "kl": 0.0029296875, + "learning_rate": 9.462499999999999e-07, + "loss": -0.004339609295129776, + "reward": 2.3409087657928467, + "reward_std": 0.33414456248283386, + "rewards/GDino": 0.843651682138443, + "rewards/GIT": 0.3478253483772278, + "rewards/HPSv2": 0.2929649353027344, + "rewards/ORM": 0.8564667999744415, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.6875, + "step": 86 + }, + { + "completion_length": 71.796875, + "epoch": 0.09634551495016612, + "grad_norm": 0.6815423965454102, + "kl": 0.0028076171875, + "learning_rate": 9.45625e-07, + "loss": 0.004890406038612127, + "reward": 2.096968352794647, + "reward_std": 0.4522961378097534, + "rewards/GDino": 0.7090134918689728, + "rewards/GIT": 0.4619881361722946, + "rewards/HPSv2": 0.26172447204589844, + "rewards/ORM": 0.6642423272132874, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.3125, + "step": 87 + }, + { + "completion_length": 62.921875, + "epoch": 0.09745293466223699, + "grad_norm": 0.37047135829925537, + "kl": 0.00237274169921875, + "learning_rate": 9.45e-07, + "loss": -0.007989626843482256, + "reward": 2.100303888320923, + "reward_std": 0.39728429913520813, + "rewards/GDino": 0.8100375235080719, + "rewards/GIT": 0.4551214128732681, + "rewards/HPSv2": 0.2669391632080078, + "rewards/ORM": 0.5682056248188019, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.625, + "step": 88 + }, + { + "completion_length": 61.25, + "epoch": 0.09856035437430787, + "grad_norm": 0.3903006613254547, + "kl": 0.0033111572265625, + "learning_rate": 9.44375e-07, + "loss": -0.0016460134647786617, + "reward": 2.1185483932495117, + "reward_std": 0.34406720101833344, + "rewards/GDino": 0.7301153540611267, + "rewards/GIT": 0.4342738687992096, + "rewards/HPSv2": 0.25724220275878906, + "rewards/ORM": 0.6969169676303864, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.0625, + "step": 89 + }, + { + "completion_length": 64.734375, + "epoch": 0.09966777408637874, + "grad_norm": 0.6106704473495483, + "kl": 0.002532958984375, + "learning_rate": 9.4375e-07, + "loss": 0.0018994538113474846, + "reward": 2.281058669090271, + "reward_std": 0.4019897133111954, + "rewards/GDino": 0.8515625298023224, + "rewards/GIT": 0.602006196975708, + "rewards/HPSv2": 0.2570476531982422, + "rewards/ORM": 0.5704423487186432, + "self_certainty_semantic": -25.625, + "self_certainty_token": -20.875, + "step": 90 + }, + { + "completion_length": 70.625, + "epoch": 0.10077519379844961, + "grad_norm": 0.6082563996315002, + "kl": 0.0025634765625, + "learning_rate": 9.43125e-07, + "loss": -0.001378488726913929, + "reward": 1.7446696758270264, + "reward_std": 0.48222504556179047, + "rewards/GDino": 0.6369770467281342, + "rewards/GIT": 0.4495050609111786, + "rewards/HPSv2": 0.2379169464111328, + "rewards/ORM": 0.42027057707309723, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.25, + "step": 91 + }, + { + "completion_length": 69.328125, + "epoch": 0.10188261351052048, + "grad_norm": 0.3885723054409027, + "kl": 0.00247955322265625, + "learning_rate": 9.425e-07, + "loss": 0.0029599489644169807, + "reward": 1.6940485835075378, + "reward_std": 0.48791858553886414, + "rewards/GDino": 0.7451692521572113, + "rewards/GIT": 0.3888908475637436, + "rewards/HPSv2": 0.23882293701171875, + "rewards/ORM": 0.32116562128067017, + "self_certainty_semantic": -25.5, + "self_certainty_token": -20.9375, + "step": 92 + }, + { + "completion_length": 78.96875, + "epoch": 0.10299003322259136, + "grad_norm": 2.441729784011841, + "kl": 0.00281524658203125, + "learning_rate": 9.41875e-07, + "loss": 0.0027102059684693813, + "reward": 2.098644495010376, + "reward_std": 0.5861929953098297, + "rewards/GDino": 0.7753971815109253, + "rewards/GIT": 0.33432240784168243, + "rewards/HPSv2": 0.24440956115722656, + "rewards/ORM": 0.7445152401924133, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.3125, + "step": 93 + }, + { + "completion_length": 53.640625, + "epoch": 0.10409745293466224, + "grad_norm": 1.843809962272644, + "kl": 0.00298309326171875, + "learning_rate": 9.4125e-07, + "loss": -0.002976842690259218, + "reward": 2.022274136543274, + "reward_std": 0.3149227201938629, + "rewards/GDino": 0.7854060530662537, + "rewards/GIT": 0.20830318331718445, + "rewards/HPSv2": 0.2829475402832031, + "rewards/ORM": 0.7456172108650208, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.4375, + "step": 94 + }, + { + "completion_length": 73.8125, + "epoch": 0.10520487264673312, + "grad_norm": 0.4806905686855316, + "kl": 0.0027923583984375, + "learning_rate": 9.40625e-07, + "loss": 0.0057201930321753025, + "reward": 2.5528862476348877, + "reward_std": 0.3981771767139435, + "rewards/GDino": 0.9458979666233063, + "rewards/GIT": 0.7319882810115814, + "rewards/HPSv2": 0.265625, + "rewards/ORM": 0.609375, + "self_certainty_semantic": -25.625, + "self_certainty_token": -22.375, + "step": 95 + }, + { + "completion_length": 71.578125, + "epoch": 0.10631229235880399, + "grad_norm": 1.3328330516815186, + "kl": 0.00286865234375, + "learning_rate": 9.399999999999999e-07, + "loss": 0.006992874434217811, + "reward": 2.4351861476898193, + "reward_std": 0.25794728100299835, + "rewards/GDino": 0.9020833373069763, + "rewards/GIT": 0.6907803118228912, + "rewards/HPSv2": 0.2606678009033203, + "rewards/ORM": 0.5816546380519867, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -20.125, + "step": 96 + }, + { + "completion_length": 60.703125, + "epoch": 0.10741971207087486, + "grad_norm": 0.5019268989562988, + "kl": 0.003326416015625, + "learning_rate": 9.393749999999999e-07, + "loss": 0.011835527839139104, + "reward": 1.6200063824653625, + "reward_std": 0.4240207076072693, + "rewards/GDino": 0.6504360437393188, + "rewards/GIT": 0.18544349074363708, + "rewards/HPSv2": 0.2720832824707031, + "rewards/ORM": 0.5120435357093811, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.4375, + "step": 97 + }, + { + "completion_length": 68.578125, + "epoch": 0.10852713178294573, + "grad_norm": 0.38334423303604126, + "kl": 0.003143310546875, + "learning_rate": 9.387499999999999e-07, + "loss": 0.0015034456737339497, + "reward": 1.9381686449050903, + "reward_std": 0.46784070134162903, + "rewards/GDino": 0.7850436270236969, + "rewards/GIT": 0.3971538841724396, + "rewards/HPSv2": 0.2517681121826172, + "rewards/ORM": 0.5042029470205307, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -22.0, + "step": 98 + }, + { + "completion_length": 72.234375, + "epoch": 0.10963455149501661, + "grad_norm": 1.5332801342010498, + "kl": 0.0026702880859375, + "learning_rate": 9.381249999999999e-07, + "loss": 0.0014210238587111235, + "reward": 2.1606199741363525, + "reward_std": 0.4609396979212761, + "rewards/GDino": 0.800000011920929, + "rewards/GIT": 0.6965132355690002, + "rewards/HPSv2": 0.2425823211669922, + "rewards/ORM": 0.4215243309736252, + "self_certainty_semantic": -25.625, + "self_certainty_token": -22.125, + "step": 99 + }, + { + "completion_length": 64.859375, + "epoch": 0.11074197120708748, + "grad_norm": 0.4810887575149536, + "kl": 0.0039520263671875, + "learning_rate": 9.374999999999999e-07, + "loss": -0.006660776911303401, + "reward": 2.0300318002700806, + "reward_std": 0.49300554394721985, + "rewards/GDino": 0.6639764606952667, + "rewards/GIT": 0.41904042661190033, + "rewards/HPSv2": 0.25483131408691406, + "rewards/ORM": 0.6921834945678711, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.5, + "step": 100 + }, + { + "completion_length": 59.671875, + "epoch": 0.11184939091915837, + "grad_norm": 0.6347000002861023, + "kl": 0.0032196044921875, + "learning_rate": 9.368749999999999e-07, + "loss": 0.007826576009392738, + "reward": 2.343237042427063, + "reward_std": 0.29696404933929443, + "rewards/GDino": 0.8815763592720032, + "rewards/GIT": 0.5084297135472298, + "rewards/HPSv2": 0.27715301513671875, + "rewards/ORM": 0.6760779917240143, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -20.4375, + "step": 101 + }, + { + "completion_length": 54.6875, + "epoch": 0.11295681063122924, + "grad_norm": 0.433162659406662, + "kl": 0.00323486328125, + "learning_rate": 9.3625e-07, + "loss": -0.0018342176917940378, + "reward": 2.244241714477539, + "reward_std": 0.3847181349992752, + "rewards/GDino": 0.7636502981185913, + "rewards/GIT": 0.5041892230510712, + "rewards/HPSv2": 0.26613616943359375, + "rewards/ORM": 0.7102660238742828, + "self_certainty_semantic": -25.375, + "self_certainty_token": -21.75, + "step": 102 + }, + { + "completion_length": 67.0, + "epoch": 0.11406423034330011, + "grad_norm": 0.4709942042827606, + "kl": 0.0036163330078125, + "learning_rate": 9.35625e-07, + "loss": -0.0053715279791504145, + "reward": 1.7866063117980957, + "reward_std": 0.48569220304489136, + "rewards/GDino": 0.6912583708763123, + "rewards/GIT": 0.2119271606206894, + "rewards/HPSv2": 0.26636314392089844, + "rewards/ORM": 0.6170576214790344, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.6875, + "step": 103 + }, + { + "completion_length": 72.9375, + "epoch": 0.11517165005537099, + "grad_norm": 0.4063447415828705, + "kl": 0.00260162353515625, + "learning_rate": 9.35e-07, + "loss": 0.002629161812365055, + "reward": 2.2642691135406494, + "reward_std": 0.34077706933021545, + "rewards/GDino": 0.83519247174263, + "rewards/GIT": 0.5088042318820953, + "rewards/HPSv2": 0.2578144073486328, + "rewards/ORM": 0.6624580323696136, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -22.25, + "step": 104 + }, + { + "completion_length": 60.078125, + "epoch": 0.11627906976744186, + "grad_norm": 0.46488699316978455, + "kl": 0.002288818359375, + "learning_rate": 9.34375e-07, + "loss": -0.003600445226766169, + "reward": 2.1485623121261597, + "reward_std": 0.4569554626941681, + "rewards/GDino": 0.7578125, + "rewards/GIT": 0.5468153655529022, + "rewards/HPSv2": 0.2572956085205078, + "rewards/ORM": 0.586638867855072, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.0, + "step": 105 + }, + { + "completion_length": 82.796875, + "epoch": 0.11738648947951273, + "grad_norm": 0.6562625765800476, + "kl": 0.00269317626953125, + "learning_rate": 9.3375e-07, + "loss": 0.006768202409148216, + "reward": 1.9783158898353577, + "reward_std": 0.1888652741909027, + "rewards/GDino": 0.7153646051883698, + "rewards/GIT": 0.5914923697710037, + "rewards/HPSv2": 0.2652587890625, + "rewards/ORM": 0.4062000662088394, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.0625, + "step": 106 + }, + { + "completion_length": 65.59375, + "epoch": 0.1184939091915836, + "grad_norm": 0.45307597517967224, + "kl": 0.003082275390625, + "learning_rate": 9.33125e-07, + "loss": 0.004376767203211784, + "reward": 2.5454152822494507, + "reward_std": 0.3043108731508255, + "rewards/GDino": 0.9536458253860474, + "rewards/GIT": 0.7616239190101624, + "rewards/HPSv2": 0.25897979736328125, + "rewards/ORM": 0.5711656212806702, + "self_certainty_semantic": -25.5, + "self_certainty_token": -20.6875, + "step": 107 + }, + { + "completion_length": 61.734375, + "epoch": 0.11960132890365449, + "grad_norm": 0.41155651211738586, + "kl": 0.0034942626953125, + "learning_rate": 9.325e-07, + "loss": 0.00791933387517929, + "reward": 2.225056529045105, + "reward_std": 0.2606152221560478, + "rewards/GDino": 0.7756550312042236, + "rewards/GIT": 0.44980524480342865, + "rewards/HPSv2": 0.2855796813964844, + "rewards/ORM": 0.7140165567398071, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -20.875, + "step": 108 + }, + { + "completion_length": 62.3125, + "epoch": 0.12070874861572536, + "grad_norm": 0.5856253504753113, + "kl": 0.00328826904296875, + "learning_rate": 9.31875e-07, + "loss": -0.014065259601920843, + "reward": 2.116065502166748, + "reward_std": 0.42074093222618103, + "rewards/GDino": 0.8158511817455292, + "rewards/GIT": 0.5546791851520538, + "rewards/HPSv2": 0.26972389221191406, + "rewards/ORM": 0.4758111536502838, + "self_certainty_semantic": -25.625, + "self_certainty_token": -22.0625, + "step": 109 + }, + { + "completion_length": 53.6875, + "epoch": 0.12181616832779624, + "grad_norm": 0.47900426387786865, + "kl": 0.00299835205078125, + "learning_rate": 9.3125e-07, + "loss": 0.004598683924996294, + "reward": 2.2211345434188843, + "reward_std": 0.4559909552335739, + "rewards/GDino": 0.843098521232605, + "rewards/GIT": 0.39484143257141113, + "rewards/HPSv2": 0.23913192749023438, + "rewards/ORM": 0.7440627217292786, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -22.0625, + "step": 110 + }, + { + "completion_length": 62.5625, + "epoch": 0.12292358803986711, + "grad_norm": 0.5505498051643372, + "kl": 0.00334930419921875, + "learning_rate": 9.30625e-07, + "loss": -0.009575113654136658, + "reward": 1.8931084871292114, + "reward_std": 0.3895595818758011, + "rewards/GDino": 0.6988297700881958, + "rewards/GIT": 0.34851039946079254, + "rewards/HPSv2": 0.2725067138671875, + "rewards/ORM": 0.5732617080211639, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -22.1875, + "step": 111 + }, + { + "completion_length": 66.125, + "epoch": 0.12403100775193798, + "grad_norm": 0.5518302321434021, + "kl": 0.0044097900390625, + "learning_rate": 9.3e-07, + "loss": 0.001083985436707735, + "reward": 2.1159579753875732, + "reward_std": 0.3097255080938339, + "rewards/GDino": 0.7588914632797241, + "rewards/GIT": 0.3177434876561165, + "rewards/HPSv2": 0.2764263153076172, + "rewards/ORM": 0.7628966867923737, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -22.125, + "step": 112 + }, + { + "completion_length": 57.203125, + "epoch": 0.12513842746400886, + "grad_norm": 0.5670230388641357, + "kl": 0.00327301025390625, + "learning_rate": 9.293749999999999e-07, + "loss": 0.013281037099659443, + "reward": 1.6267165541648865, + "reward_std": 0.36898210644721985, + "rewards/GDino": 0.6410032212734222, + "rewards/GIT": 0.2818482890725136, + "rewards/HPSv2": 0.26859092712402344, + "rewards/ORM": 0.4352741092443466, + "self_certainty_semantic": -25.3125, + "self_certainty_token": -21.25, + "step": 113 + }, + { + "completion_length": 68.296875, + "epoch": 0.12624584717607973, + "grad_norm": 0.6704270243644714, + "kl": 0.00307464599609375, + "learning_rate": 9.287499999999999e-07, + "loss": 0.00015758577501401305, + "reward": 2.3069713711738586, + "reward_std": 0.36960119009017944, + "rewards/GDino": 0.7588542103767395, + "rewards/GIT": 0.6726887226104736, + "rewards/HPSv2": 0.2751197814941406, + "rewards/ORM": 0.6003087162971497, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.25, + "step": 114 + }, + { + "completion_length": 63.890625, + "epoch": 0.1273532668881506, + "grad_norm": 0.6844286918640137, + "kl": 0.00408935546875, + "learning_rate": 9.281249999999999e-07, + "loss": 0.0020853045862168074, + "reward": 2.1885178685188293, + "reward_std": 0.35547153651714325, + "rewards/GDino": 0.718020498752594, + "rewards/GIT": 0.5492343008518219, + "rewards/HPSv2": 0.2481842041015625, + "rewards/ORM": 0.6730788052082062, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -20.75, + "step": 115 + }, + { + "completion_length": 75.21875, + "epoch": 0.12846068660022147, + "grad_norm": 0.5827351212501526, + "kl": 0.003021240234375, + "learning_rate": 9.274999999999999e-07, + "loss": 0.0005021943943575025, + "reward": 2.2085607051849365, + "reward_std": 0.391997292637825, + "rewards/GDino": 0.7475058436393738, + "rewards/GIT": 0.5436886698007584, + "rewards/HPSv2": 0.26111602783203125, + "rewards/ORM": 0.65625, + "self_certainty_semantic": -25.5, + "self_certainty_token": -20.75, + "step": 116 + }, + { + "completion_length": 66.234375, + "epoch": 0.12956810631229235, + "grad_norm": 8.78965950012207, + "kl": 0.158905029296875, + "learning_rate": 9.268749999999999e-07, + "loss": -0.0129257976077497, + "reward": 2.4095414876937866, + "reward_std": 0.2911904752254486, + "rewards/GDino": 0.8304687738418579, + "rewards/GIT": 0.6444451212882996, + "rewards/HPSv2": 0.27797698974609375, + "rewards/ORM": 0.6566506326198578, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.75, + "step": 117 + }, + { + "completion_length": 64.578125, + "epoch": 0.13067552602436322, + "grad_norm": 0.6560596823692322, + "kl": 0.00417327880859375, + "learning_rate": 9.2625e-07, + "loss": 0.0029480335651896894, + "reward": 1.8815761804580688, + "reward_std": 0.3823118060827255, + "rewards/GDino": 0.7314696907997131, + "rewards/GIT": 0.41885554790496826, + "rewards/HPSv2": 0.24540138244628906, + "rewards/ORM": 0.4858495891094208, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -20.625, + "step": 118 + }, + { + "completion_length": 56.828125, + "epoch": 0.13178294573643412, + "grad_norm": 1.9917776584625244, + "kl": 0.0042877197265625, + "learning_rate": 9.25625e-07, + "loss": -0.01110410038381815, + "reward": 2.270492196083069, + "reward_std": 0.5458246767520905, + "rewards/GDino": 0.7566670179367065, + "rewards/GIT": 0.5055328160524368, + "rewards/HPSv2": 0.26803016662597656, + "rewards/ORM": 0.7402622997760773, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.875, + "step": 119 + }, + { + "completion_length": 72.3125, + "epoch": 0.132890365448505, + "grad_norm": 0.510168194770813, + "kl": 0.00420379638671875, + "learning_rate": 9.25e-07, + "loss": -0.013864397071301937, + "reward": 1.973584771156311, + "reward_std": 0.4184395670890808, + "rewards/GDino": 0.7117854058742523, + "rewards/GIT": 0.43370192497968674, + "rewards/HPSv2": 0.26166534423828125, + "rewards/ORM": 0.5664321482181549, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.3125, + "step": 120 + }, + { + "completion_length": 77.171875, + "epoch": 0.13399778516057587, + "grad_norm": 0.5348736643791199, + "kl": 0.00298309326171875, + "learning_rate": 9.243749999999999e-07, + "loss": 0.004201958421617746, + "reward": 1.9280533194541931, + "reward_std": 0.4291805773973465, + "rewards/GDino": 0.7109375, + "rewards/GIT": 0.38363416492938995, + "rewards/HPSv2": 0.25235748291015625, + "rewards/ORM": 0.5811240971088409, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.0625, + "step": 121 + }, + { + "completion_length": 70.53125, + "epoch": 0.13510520487264674, + "grad_norm": 0.49879971146583557, + "kl": 0.00412750244140625, + "learning_rate": 9.237499999999999e-07, + "loss": -0.0026759039610624313, + "reward": 1.9971369504928589, + "reward_std": 0.2551337629556656, + "rewards/GDino": 0.72983318567276, + "rewards/GIT": 0.34402593970298767, + "rewards/HPSv2": 0.2877368927001953, + "rewards/ORM": 0.6355408430099487, + "self_certainty_semantic": -25.5, + "self_certainty_token": -22.125, + "step": 122 + }, + { + "completion_length": 64.640625, + "epoch": 0.1362126245847176, + "grad_norm": 0.4230790436267853, + "kl": 0.00341033935546875, + "learning_rate": 9.23125e-07, + "loss": -0.002337672747671604, + "reward": 2.0281134843826294, + "reward_std": 0.3781726509332657, + "rewards/GDino": 0.7874999940395355, + "rewards/GIT": 0.4591221511363983, + "rewards/HPSv2": 0.2555961608886719, + "rewards/ORM": 0.5258950889110565, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.5625, + "step": 123 + }, + { + "completion_length": 59.375, + "epoch": 0.13732004429678848, + "grad_norm": 0.9666682481765747, + "kl": 0.00328826904296875, + "learning_rate": 9.225e-07, + "loss": -0.010707761626690626, + "reward": 2.219977855682373, + "reward_std": 0.396147683262825, + "rewards/GDino": 0.7934323251247406, + "rewards/GIT": 0.4874458909034729, + "rewards/HPSv2": 0.2524528503417969, + "rewards/ORM": 0.6866468489170074, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.0, + "step": 124 + }, + { + "completion_length": 67.421875, + "epoch": 0.13842746400885936, + "grad_norm": 0.4701387286186218, + "kl": 0.00374603271484375, + "learning_rate": 9.21875e-07, + "loss": -0.008014392806217074, + "reward": 2.166910171508789, + "reward_std": 0.44899792969226837, + "rewards/GDino": 0.7873771488666534, + "rewards/GIT": 0.5715728402137756, + "rewards/HPSv2": 0.25487709045410156, + "rewards/ORM": 0.5530830323696136, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -20.9375, + "step": 125 + }, + { + "completion_length": 60.46875, + "epoch": 0.13953488372093023, + "grad_norm": 0.6960640549659729, + "kl": 0.0052337646484375, + "learning_rate": 9.2125e-07, + "loss": 0.005524930078536272, + "reward": 1.941537857055664, + "reward_std": 0.3068820387125015, + "rewards/GDino": 0.69914710521698, + "rewards/GIT": 0.31967807561159134, + "rewards/HPSv2": 0.26458740234375, + "rewards/ORM": 0.6581252217292786, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.0, + "step": 126 + }, + { + "completion_length": 65.90625, + "epoch": 0.1406423034330011, + "grad_norm": 0.5266240239143372, + "kl": 0.0050506591796875, + "learning_rate": 9.20625e-07, + "loss": -0.008795970119535923, + "reward": 2.2745760679244995, + "reward_std": 0.35941246151924133, + "rewards/GDino": 0.7357383072376251, + "rewards/GIT": 0.42085812985897064, + "rewards/HPSv2": 0.2789630889892578, + "rewards/ORM": 0.8390165567398071, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.8125, + "step": 127 + }, + { + "completion_length": 62.0, + "epoch": 0.14174972314507198, + "grad_norm": 1.2693217992782593, + "kl": 0.00701904296875, + "learning_rate": 9.2e-07, + "loss": -0.013476235326379538, + "reward": 1.8667319416999817, + "reward_std": 0.5579482614994049, + "rewards/GDino": 0.6687500178813934, + "rewards/GIT": 0.240242637693882, + "rewards/HPSv2": 0.2608222961425781, + "rewards/ORM": 0.6969169527292252, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -20.8125, + "step": 128 + }, + { + "completion_length": 72.53125, + "epoch": 0.14285714285714285, + "grad_norm": 1.4665846824645996, + "kl": 0.0047454833984375, + "learning_rate": 9.19375e-07, + "loss": -0.006278489250689745, + "reward": 2.076420545578003, + "reward_std": 0.36895356327295303, + "rewards/GDino": 0.739062488079071, + "rewards/GIT": 0.41109369695186615, + "rewards/HPSv2": 0.2513103485107422, + "rewards/ORM": 0.6749540567398071, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.875, + "step": 129 + }, + { + "completion_length": 58.046875, + "epoch": 0.14396456256921372, + "grad_norm": 0.7384111285209656, + "kl": 0.00390625, + "learning_rate": 9.187499999999999e-07, + "loss": -0.0109781245701015, + "reward": 1.9833685159683228, + "reward_std": 0.39847198128700256, + "rewards/GDino": 0.7729166448116302, + "rewards/GIT": 0.4782646894454956, + "rewards/HPSv2": 0.24262619018554688, + "rewards/ORM": 0.48956090211868286, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.375, + "step": 130 + }, + { + "completion_length": 72.0, + "epoch": 0.1450719822812846, + "grad_norm": 0.46645256876945496, + "kl": 0.00476837158203125, + "learning_rate": 9.181249999999999e-07, + "loss": 0.006110590882599354, + "reward": 1.885680913925171, + "reward_std": 0.4655804932117462, + "rewards/GDino": 0.7249231338500977, + "rewards/GIT": 0.35940520465373993, + "rewards/HPSv2": 0.2583580017089844, + "rewards/ORM": 0.5429946184158325, + "self_certainty_semantic": -25.375, + "self_certainty_token": -22.0625, + "step": 131 + }, + { + "completion_length": 53.21875, + "epoch": 0.1461794019933555, + "grad_norm": 0.5023438930511475, + "kl": 0.00583648681640625, + "learning_rate": 9.174999999999999e-07, + "loss": -0.0056219237158074975, + "reward": 2.1214953660964966, + "reward_std": 0.5559927821159363, + "rewards/GDino": 0.8054038286209106, + "rewards/GIT": 0.4245864748954773, + "rewards/HPSv2": 0.2713184356689453, + "rewards/ORM": 0.6201866269111633, + "self_certainty_semantic": -25.75, + "self_certainty_token": -22.0625, + "step": 132 + }, + { + "completion_length": 75.6875, + "epoch": 0.14728682170542637, + "grad_norm": 0.6622663140296936, + "kl": 0.00439453125, + "learning_rate": 9.168749999999999e-07, + "loss": 0.009899101918563247, + "reward": 2.593212366104126, + "reward_std": 0.17419864609837532, + "rewards/GDino": 0.7739583253860474, + "rewards/GIT": 0.6746057868003845, + "rewards/HPSv2": 0.2743816375732422, + "rewards/ORM": 0.8702665567398071, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.75, + "step": 133 + }, + { + "completion_length": 68.90625, + "epoch": 0.14839424141749724, + "grad_norm": 0.41897183656692505, + "kl": 0.0034942626953125, + "learning_rate": 9.1625e-07, + "loss": 0.002212307066656649, + "reward": 1.978962779045105, + "reward_std": 0.45697829127311707, + "rewards/GDino": 0.7175242900848389, + "rewards/GIT": 0.5035496056079865, + "rewards/HPSv2": 0.24994659423828125, + "rewards/ORM": 0.5079423785209656, + "self_certainty_semantic": -25.625, + "self_certainty_token": -20.3125, + "step": 134 + }, + { + "completion_length": 62.859375, + "epoch": 0.14950166112956811, + "grad_norm": 0.5371299386024475, + "kl": 0.00482177734375, + "learning_rate": 9.15625e-07, + "loss": 0.005879509728401899, + "reward": 2.0941214561462402, + "reward_std": 0.47014716267585754, + "rewards/GDino": 0.774738609790802, + "rewards/GIT": 0.4917849898338318, + "rewards/HPSv2": 0.267425537109375, + "rewards/ORM": 0.5601723045110703, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -20.1875, + "step": 135 + }, + { + "completion_length": 76.578125, + "epoch": 0.150609080841639, + "grad_norm": 0.48601874709129333, + "kl": 0.004486083984375, + "learning_rate": 9.15e-07, + "loss": -0.0003573829308152199, + "reward": 1.8426015377044678, + "reward_std": 0.2483576349914074, + "rewards/GDino": 0.684923529624939, + "rewards/GIT": 0.3237183541059494, + "rewards/HPSv2": 0.2632465362548828, + "rewards/ORM": 0.5707131624221802, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.0625, + "step": 136 + }, + { + "completion_length": 70.078125, + "epoch": 0.15171650055370986, + "grad_norm": 0.5911806225776672, + "kl": 0.0052947998046875, + "learning_rate": 9.14375e-07, + "loss": -0.008954334072768688, + "reward": 2.0952707529067993, + "reward_std": 0.42313070595264435, + "rewards/GDino": 0.7640625238418579, + "rewards/GIT": 0.5078665241599083, + "rewards/HPSv2": 0.25115394592285156, + "rewards/ORM": 0.5721877217292786, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.4375, + "step": 137 + }, + { + "completion_length": 61.75, + "epoch": 0.15282392026578073, + "grad_norm": 0.6094731688499451, + "kl": 0.00860595703125, + "learning_rate": 9.137499999999999e-07, + "loss": -0.00691208359785378, + "reward": 1.8424771428108215, + "reward_std": 0.3106200248003006, + "rewards/GDino": 0.6280561089515686, + "rewards/GIT": 0.2153022214770317, + "rewards/HPSv2": 0.2725563049316406, + "rewards/ORM": 0.7265625, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -21.0, + "step": 138 + }, + { + "completion_length": 76.21875, + "epoch": 0.1539313399778516, + "grad_norm": 0.7681946754455566, + "kl": 0.0045166015625, + "learning_rate": 9.131249999999999e-07, + "loss": 0.006304489565081894, + "reward": 2.0444042682647705, + "reward_std": 0.4021482616662979, + "rewards/GDino": 0.7844302356243134, + "rewards/GIT": 0.33466267585754395, + "rewards/HPSv2": 0.26512908935546875, + "rewards/ORM": 0.6601821780204773, + "self_certainty_semantic": -25.375, + "self_certainty_token": -22.0, + "step": 139 + }, + { + "completion_length": 64.78125, + "epoch": 0.15503875968992248, + "grad_norm": 0.404694527387619, + "kl": 0.00445556640625, + "learning_rate": 9.124999999999999e-07, + "loss": 0.0074170518782921135, + "reward": 2.199423849582672, + "reward_std": 0.3181084841489792, + "rewards/GDino": 0.8405935764312744, + "rewards/GIT": 0.5380776524543762, + "rewards/HPSv2": 0.2516937255859375, + "rewards/ORM": 0.5690587759017944, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.375, + "step": 140 + }, + { + "completion_length": 80.1875, + "epoch": 0.15614617940199335, + "grad_norm": 0.521449089050293, + "kl": 0.00370025634765625, + "learning_rate": 9.11875e-07, + "loss": 0.01646838476881385, + "reward": 2.4023600816726685, + "reward_std": 0.17732174694538116, + "rewards/GDino": 0.6875, + "rewards/GIT": 0.7328296601772308, + "rewards/HPSv2": 0.24770545959472656, + "rewards/ORM": 0.7343250513076782, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.125, + "step": 141 + }, + { + "completion_length": 68.546875, + "epoch": 0.15725359911406422, + "grad_norm": 0.4444400370121002, + "kl": 0.006500244140625, + "learning_rate": 9.1125e-07, + "loss": -0.0020874282345175743, + "reward": 2.2395375967025757, + "reward_std": 0.37212860584259033, + "rewards/GDino": 0.7598958611488342, + "rewards/GIT": 0.5187265872955322, + "rewards/HPSv2": 0.2597951889038086, + "rewards/ORM": 0.7011198401451111, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.0625, + "step": 142 + }, + { + "completion_length": 60.453125, + "epoch": 0.1583610188261351, + "grad_norm": 0.5732141137123108, + "kl": 0.006134033203125, + "learning_rate": 9.10625e-07, + "loss": -0.0019202656112611294, + "reward": 1.9194607138633728, + "reward_std": 0.5088343024253845, + "rewards/GDino": 0.705212414264679, + "rewards/GIT": 0.3693596422672272, + "rewards/HPSv2": 0.2593517303466797, + "rewards/ORM": 0.5855368673801422, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.125, + "step": 143 + }, + { + "completion_length": 69.53125, + "epoch": 0.15946843853820597, + "grad_norm": 0.5136631727218628, + "kl": 0.00463104248046875, + "learning_rate": 9.1e-07, + "loss": -0.0024181478656828403, + "reward": 2.1130378246307373, + "reward_std": 0.3436143696308136, + "rewards/GDino": 0.6970658600330353, + "rewards/GIT": 0.5147460252046585, + "rewards/HPSv2": 0.2531890869140625, + "rewards/ORM": 0.6480368673801422, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.3125, + "step": 144 + }, + { + "completion_length": 63.234375, + "epoch": 0.16057585825027684, + "grad_norm": 0.425749808549881, + "kl": 0.0057220458984375, + "learning_rate": 9.09375e-07, + "loss": 0.0033237107563763857, + "reward": 1.907556176185608, + "reward_std": 0.3990510255098343, + "rewards/GDino": 0.7011643946170807, + "rewards/GIT": 0.3098641186952591, + "rewards/HPSv2": 0.28241920471191406, + "rewards/ORM": 0.6141084432601929, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -22.5625, + "step": 145 + }, + { + "completion_length": 62.953125, + "epoch": 0.16168327796234774, + "grad_norm": 0.5104310512542725, + "kl": 0.0064239501953125, + "learning_rate": 9.087499999999999e-07, + "loss": 0.010284929594490677, + "reward": 2.080387771129608, + "reward_std": 0.4294509291648865, + "rewards/GDino": 0.8376201391220093, + "rewards/GIT": 0.3540365919470787, + "rewards/HPSv2": 0.27114295959472656, + "rewards/ORM": 0.6175881326198578, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.5, + "step": 146 + }, + { + "completion_length": 67.625, + "epoch": 0.16279069767441862, + "grad_norm": 0.5227380394935608, + "kl": 0.0070343017578125, + "learning_rate": 9.081249999999999e-07, + "loss": 0.003552068490535021, + "reward": 1.605971097946167, + "reward_std": 0.3158091753721237, + "rewards/GDino": 0.6382401585578918, + "rewards/GIT": 0.19080179929733276, + "rewards/HPSv2": 0.25063323974609375, + "rewards/ORM": 0.5262957215309143, + "self_certainty_semantic": -25.375, + "self_certainty_token": -22.9375, + "step": 147 + }, + { + "completion_length": 69.53125, + "epoch": 0.1638981173864895, + "grad_norm": 0.5913640260696411, + "kl": 0.008758544921875, + "learning_rate": 9.074999999999999e-07, + "loss": 0.0023775382433086634, + "reward": 2.265665352344513, + "reward_std": 0.3249353617429733, + "rewards/GDino": 0.8458716571331024, + "rewards/GIT": 0.38859403878450394, + "rewards/HPSv2": 0.27611541748046875, + "rewards/ORM": 0.7550841569900513, + "self_certainty_semantic": -25.9375, + "self_certainty_token": -21.625, + "step": 148 + }, + { + "completion_length": 69.390625, + "epoch": 0.16500553709856036, + "grad_norm": 0.6509791016578674, + "kl": 0.0075836181640625, + "learning_rate": 9.068749999999999e-07, + "loss": -0.010468412889167666, + "reward": 2.1014277935028076, + "reward_std": 0.29370661079883575, + "rewards/GDino": 0.7491666674613953, + "rewards/GIT": 0.3259096145629883, + "rewards/HPSv2": 0.2623310089111328, + "rewards/ORM": 0.7640205323696136, + "self_certainty_semantic": -25.75, + "self_certainty_token": -22.0, + "step": 149 + }, + { + "completion_length": 64.046875, + "epoch": 0.16611295681063123, + "grad_norm": 2.004599094390869, + "kl": 0.01568603515625, + "learning_rate": 9.0625e-07, + "loss": -0.003110084217041731, + "reward": 2.0497288703918457, + "reward_std": 0.46643751859664917, + "rewards/GDino": 0.7837072014808655, + "rewards/GIT": 0.31941479444503784, + "rewards/HPSv2": 0.2623157501220703, + "rewards/ORM": 0.6842910945415497, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.875, + "step": 150 + }, + { + "completion_length": 68.875, + "epoch": 0.1672203765227021, + "grad_norm": 1.2010647058486938, + "kl": 0.0079498291015625, + "learning_rate": 9.05625e-07, + "loss": 0.0036378083750605583, + "reward": 2.19494891166687, + "reward_std": 0.5349652469158173, + "rewards/GDino": 0.7948823869228363, + "rewards/GIT": 0.3874897435307503, + "rewards/HPSv2": 0.2666778564453125, + "rewards/ORM": 0.7458988428115845, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.75, + "step": 151 + }, + { + "completion_length": 65.46875, + "epoch": 0.16832779623477298, + "grad_norm": 0.4594494700431824, + "kl": 0.0051727294921875, + "learning_rate": 9.05e-07, + "loss": 0.0013301910366863012, + "reward": 2.1984575986862183, + "reward_std": 0.2301565483212471, + "rewards/GDino": 0.8368903398513794, + "rewards/GIT": 0.4207738786935806, + "rewards/HPSv2": 0.27980995178222656, + "rewards/ORM": 0.6609834432601929, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.3125, + "step": 152 + }, + { + "completion_length": 60.875, + "epoch": 0.16943521594684385, + "grad_norm": 0.584158182144165, + "kl": 0.006622314453125, + "learning_rate": 9.04375e-07, + "loss": -0.006514292559586465, + "reward": 2.2534468173980713, + "reward_std": 0.3471103012561798, + "rewards/GDino": 0.7832907140254974, + "rewards/GIT": 0.6241410374641418, + "rewards/HPSv2": 0.2647590637207031, + "rewards/ORM": 0.5812558829784393, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.8125, + "step": 153 + }, + { + "completion_length": 71.234375, + "epoch": 0.17054263565891473, + "grad_norm": 0.3877808153629303, + "kl": 0.0067138671875, + "learning_rate": 9.0375e-07, + "loss": -0.00840937439352274, + "reward": 1.5600855946540833, + "reward_std": 0.1888522505760193, + "rewards/GDino": 0.6892416477203369, + "rewards/GIT": 0.1894538253545761, + "rewards/HPSv2": 0.26103973388671875, + "rewards/ORM": 0.42035043239593506, + "self_certainty_semantic": -25.4375, + "self_certainty_token": -23.125, + "step": 154 + }, + { + "completion_length": 75.15625, + "epoch": 0.1716500553709856, + "grad_norm": 0.48354580998420715, + "kl": 0.0096588134765625, + "learning_rate": 9.031249999999999e-07, + "loss": 0.019050699658691883, + "reward": 2.116607189178467, + "reward_std": 0.290459081530571, + "rewards/GDino": 0.6718750298023224, + "rewards/GIT": 0.4389065280556679, + "rewards/HPSv2": 0.26484203338623047, + "rewards/ORM": 0.7409836649894714, + "self_certainty_semantic": -25.625, + "self_certainty_token": -20.625, + "step": 155 + }, + { + "completion_length": 70.28125, + "epoch": 0.17275747508305647, + "grad_norm": 0.48019152879714966, + "kl": 0.00909423828125, + "learning_rate": 9.024999999999999e-07, + "loss": -0.006820322363637388, + "reward": 1.7913519144058228, + "reward_std": 0.4075485020875931, + "rewards/GDino": 0.6470568478107452, + "rewards/GIT": 0.21577580273151398, + "rewards/HPSv2": 0.2772235870361328, + "rewards/ORM": 0.6512957215309143, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.125, + "step": 156 + }, + { + "completion_length": 79.765625, + "epoch": 0.17386489479512734, + "grad_norm": 0.4524085223674774, + "kl": 0.0062713623046875, + "learning_rate": 9.018749999999999e-07, + "loss": -0.008496122900396585, + "reward": 2.5269054174423218, + "reward_std": 0.3125455528497696, + "rewards/GDino": 0.8450000286102295, + "rewards/GIT": 0.7050136923789978, + "rewards/HPSv2": 0.24599647521972656, + "rewards/ORM": 0.7308953106403351, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -22.1875, + "step": 157 + }, + { + "completion_length": 64.5, + "epoch": 0.17497231450719822, + "grad_norm": 0.43005651235580444, + "kl": 0.009552001953125, + "learning_rate": 9.0125e-07, + "loss": 0.005564866121858358, + "reward": 2.3001022338867188, + "reward_std": 0.2847408503293991, + "rewards/GDino": 0.8344532251358032, + "rewards/GIT": 0.420885294675827, + "rewards/HPSv2": 0.27132606506347656, + "rewards/ORM": 0.7734375298023224, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.0, + "step": 158 + }, + { + "completion_length": 70.171875, + "epoch": 0.1760797342192691, + "grad_norm": 0.6674854159355164, + "kl": 0.009185791015625, + "learning_rate": 9.00625e-07, + "loss": 0.001701198983937502, + "reward": 2.222777843475342, + "reward_std": 0.4929357320070267, + "rewards/GDino": 0.7640625238418579, + "rewards/GIT": 0.48828309774398804, + "rewards/HPSv2": 0.2673072814941406, + "rewards/ORM": 0.703125, + "self_certainty_semantic": -25.5, + "self_certainty_token": -20.6875, + "step": 159 + }, + { + "completion_length": 64.421875, + "epoch": 0.17718715393134, + "grad_norm": 0.4401383399963379, + "kl": 0.008697509765625, + "learning_rate": 9e-07, + "loss": 0.0025870297104120255, + "reward": 1.7824512124061584, + "reward_std": 0.44338105618953705, + "rewards/GDino": 0.7084426283836365, + "rewards/GIT": 0.286900594830513, + "rewards/HPSv2": 0.2784423828125, + "rewards/ORM": 0.5086656212806702, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -22.0, + "step": 160 + }, + { + "completion_length": 69.328125, + "epoch": 0.17829457364341086, + "grad_norm": 0.6274824142456055, + "kl": 0.008209228515625, + "learning_rate": 8.99375e-07, + "loss": 0.006771775893867016, + "reward": 2.080656409263611, + "reward_std": 0.4039708971977234, + "rewards/GDino": 0.7284385859966278, + "rewards/GIT": 0.4118357300758362, + "rewards/HPSv2": 0.2606945037841797, + "rewards/ORM": 0.6796875, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.4375, + "step": 161 + }, + { + "completion_length": 87.765625, + "epoch": 0.17940199335548174, + "grad_norm": 0.713962972164154, + "kl": 0.00885009765625, + "learning_rate": 8.9875e-07, + "loss": 0.001781372120603919, + "reward": 2.2108030319213867, + "reward_std": 0.23567625507712364, + "rewards/GDino": 0.9036458432674408, + "rewards/GIT": 0.5173117220401764, + "rewards/HPSv2": 0.2613239288330078, + "rewards/ORM": 0.5285216420888901, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.875, + "step": 162 + }, + { + "completion_length": 65.375, + "epoch": 0.1805094130675526, + "grad_norm": 0.45745736360549927, + "kl": 0.010772705078125, + "learning_rate": 8.981249999999999e-07, + "loss": -0.001884209574200213, + "reward": 2.169035792350769, + "reward_std": 0.27702826261520386, + "rewards/GDino": 0.7578125, + "rewards/GIT": 0.6291100382804871, + "rewards/HPSv2": 0.24835586547851562, + "rewards/ORM": 0.5337574481964111, + "self_certainty_semantic": -25.625, + "self_certainty_token": -20.875, + "step": 163 + }, + { + "completion_length": 66.15625, + "epoch": 0.18161683277962348, + "grad_norm": 0.4001372456550598, + "kl": 0.011199951171875, + "learning_rate": 8.974999999999999e-07, + "loss": -0.004290862008929253, + "reward": 2.6795451641082764, + "reward_std": 0.3354812413454056, + "rewards/GDino": 0.8685008883476257, + "rewards/GIT": 0.7786318361759186, + "rewards/HPSv2": 0.27187156677246094, + "rewards/ORM": 0.7605409026145935, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -20.625, + "step": 164 + }, + { + "completion_length": 71.453125, + "epoch": 0.18272425249169436, + "grad_norm": 0.6596059799194336, + "kl": 0.00909423828125, + "learning_rate": 8.96875e-07, + "loss": -0.0067337434738874435, + "reward": 2.3466144800186157, + "reward_std": 0.29852450639009476, + "rewards/GDino": 0.8130539357662201, + "rewards/GIT": 0.49434708058834076, + "rewards/HPSv2": 0.2721138000488281, + "rewards/ORM": 0.7670996189117432, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -22.875, + "step": 165 + }, + { + "completion_length": 79.859375, + "epoch": 0.18383167220376523, + "grad_norm": 0.41807329654693604, + "kl": 0.01123046875, + "learning_rate": 8.9625e-07, + "loss": 0.010698896832764149, + "reward": 2.1671139001846313, + "reward_std": 0.37620842456817627, + "rewards/GDino": 0.7225366532802582, + "rewards/GIT": 0.46812044084072113, + "rewards/HPSv2": 0.2448101043701172, + "rewards/ORM": 0.7316466867923737, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.3125, + "step": 166 + }, + { + "completion_length": 68.921875, + "epoch": 0.1849390919158361, + "grad_norm": 0.4884219467639923, + "kl": 0.010955810546875, + "learning_rate": 8.95625e-07, + "loss": 0.0020176093094050884, + "reward": 1.979174256324768, + "reward_std": 0.43148648738861084, + "rewards/GDino": 0.7630714476108551, + "rewards/GIT": 0.49030545353889465, + "rewards/HPSv2": 0.2582511901855469, + "rewards/ORM": 0.46754617989063263, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -22.0, + "step": 167 + }, + { + "completion_length": 76.453125, + "epoch": 0.18604651162790697, + "grad_norm": 0.4840864837169647, + "kl": 0.00423431396484375, + "learning_rate": 8.95e-07, + "loss": -0.0033226923551410437, + "reward": 2.049097418785095, + "reward_std": 0.2925217002630234, + "rewards/GDino": 0.7759547531604767, + "rewards/GIT": 0.5475737899541855, + "rewards/HPSv2": 0.25574493408203125, + "rewards/ORM": 0.4698239266872406, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.4375, + "step": 168 + }, + { + "completion_length": 70.6875, + "epoch": 0.18715393133997785, + "grad_norm": 0.6547427773475647, + "kl": 0.0087890625, + "learning_rate": 8.94375e-07, + "loss": -0.00017379922792315483, + "reward": 2.19344425201416, + "reward_std": 0.3008778989315033, + "rewards/GDino": 0.8275851011276245, + "rewards/GIT": 0.45398683845996857, + "rewards/HPSv2": 0.2814655303955078, + "rewards/ORM": 0.6304067671298981, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.625, + "step": 169 + }, + { + "completion_length": 82.03125, + "epoch": 0.18826135105204872, + "grad_norm": 0.5040526390075684, + "kl": 0.0142364501953125, + "learning_rate": 8.9375e-07, + "loss": -0.007077913731336594, + "reward": 2.0542516708374023, + "reward_std": 0.3690732419490814, + "rewards/GDino": 0.7519437670707703, + "rewards/GIT": 0.40589363873004913, + "rewards/HPSv2": 0.2560100555419922, + "rewards/ORM": 0.6404041647911072, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.0, + "step": 170 + }, + { + "completion_length": 64.203125, + "epoch": 0.1893687707641196, + "grad_norm": 0.4935157299041748, + "kl": 0.012420654296875, + "learning_rate": 8.931249999999999e-07, + "loss": 0.0035545220598578453, + "reward": 2.274348735809326, + "reward_std": 0.2875422090291977, + "rewards/GDino": 0.7699261903762817, + "rewards/GIT": 0.5473942309617996, + "rewards/HPSv2": 0.2648448944091797, + "rewards/ORM": 0.6921834945678711, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.0, + "step": 171 + }, + { + "completion_length": 74.953125, + "epoch": 0.19047619047619047, + "grad_norm": 0.4935402274131775, + "kl": 0.0087738037109375, + "learning_rate": 8.924999999999999e-07, + "loss": 0.004996137693524361, + "reward": 1.6501405239105225, + "reward_std": 0.3322151154279709, + "rewards/GDino": 0.5804118067026138, + "rewards/GIT": 0.419575035572052, + "rewards/HPSv2": 0.25256919860839844, + "rewards/ORM": 0.39758437871932983, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.5625, + "step": 172 + }, + { + "completion_length": 63.296875, + "epoch": 0.19158361018826134, + "grad_norm": 1.0840739011764526, + "kl": 0.0174560546875, + "learning_rate": 8.918749999999999e-07, + "loss": 0.0033964416943490505, + "reward": 2.1245768666267395, + "reward_std": 0.29341885447502136, + "rewards/GDino": 0.8359375298023224, + "rewards/GIT": 0.3758692592382431, + "rewards/HPSv2": 0.2845611572265625, + "rewards/ORM": 0.6282089054584503, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -22.0, + "step": 173 + }, + { + "completion_length": 80.53125, + "epoch": 0.19269102990033224, + "grad_norm": 0.4756031036376953, + "kl": 0.0066070556640625, + "learning_rate": 8.912499999999999e-07, + "loss": -0.001147494971519336, + "reward": 2.2244513034820557, + "reward_std": 0.3234108239412308, + "rewards/GDino": 0.7939131259918213, + "rewards/GIT": 0.5430482923984528, + "rewards/HPSv2": 0.2594108581542969, + "rewards/ORM": 0.6280790567398071, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.125, + "step": 174 + }, + { + "completion_length": 63.796875, + "epoch": 0.1937984496124031, + "grad_norm": 0.8507784605026245, + "kl": 0.01806640625, + "learning_rate": 8.906249999999999e-07, + "loss": -0.0049158919136971235, + "reward": 2.211203694343567, + "reward_std": 0.30844441056251526, + "rewards/GDino": 0.7877604365348816, + "rewards/GIT": 0.5168893337249756, + "rewards/HPSv2": 0.2628498077392578, + "rewards/ORM": 0.6437040567398071, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.125, + "step": 175 + }, + { + "completion_length": 85.40625, + "epoch": 0.19490586932447398, + "grad_norm": 0.4818137586116791, + "kl": 0.00640869140625, + "learning_rate": 8.9e-07, + "loss": -0.0028424898628145456, + "reward": 1.9287346601486206, + "reward_std": 0.36689065396785736, + "rewards/GDino": 0.7782090902328491, + "rewards/GIT": 0.4271218478679657, + "rewards/HPSv2": 0.262115478515625, + "rewards/ORM": 0.461288183927536, + "self_certainty_semantic": -25.625, + "self_certainty_token": -20.75, + "step": 176 + }, + { + "completion_length": 74.65625, + "epoch": 0.19601328903654486, + "grad_norm": 0.5553709864616394, + "kl": 0.014068603515625, + "learning_rate": 8.89375e-07, + "loss": -0.00260241178330034, + "reward": 2.277731418609619, + "reward_std": 0.36928629875183105, + "rewards/GDino": 0.7465280592441559, + "rewards/GIT": 0.4939851015806198, + "rewards/HPSv2": 0.2715930938720703, + "rewards/ORM": 0.765625, + "self_certainty_semantic": -25.875, + "self_certainty_token": -21.25, + "step": 177 + }, + { + "completion_length": 77.53125, + "epoch": 0.19712070874861573, + "grad_norm": 0.812800407409668, + "kl": 0.0077972412109375, + "learning_rate": 8.8875e-07, + "loss": -0.007587546017020941, + "reward": 2.0915766954421997, + "reward_std": 0.39137691259384155, + "rewards/GDino": 0.745751827955246, + "rewards/GIT": 0.40190117061138153, + "rewards/HPSv2": 0.2661113739013672, + "rewards/ORM": 0.6778122782707214, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.25, + "step": 178 + }, + { + "completion_length": 64.765625, + "epoch": 0.1982281284606866, + "grad_norm": 0.8705865740776062, + "kl": 0.01080322265625, + "learning_rate": 8.88125e-07, + "loss": -0.00909736379981041, + "reward": 2.4661701917648315, + "reward_std": 0.1972077488899231, + "rewards/GDino": 0.8959279954433441, + "rewards/GIT": 0.5798787474632263, + "rewards/HPSv2": 0.2825050354003906, + "rewards/ORM": 0.7078584432601929, + "self_certainty_semantic": -25.5, + "self_certainty_token": -22.4375, + "step": 179 + }, + { + "completion_length": 75.1875, + "epoch": 0.19933554817275748, + "grad_norm": 1.3513967990875244, + "kl": 0.0105743408203125, + "learning_rate": 8.874999999999999e-07, + "loss": 0.023300296626985073, + "reward": 1.805686593055725, + "reward_std": 0.4569002389907837, + "rewards/GDino": 0.748356282711029, + "rewards/GIT": 0.34142881631851196, + "rewards/HPSv2": 0.2596473693847656, + "rewards/ORM": 0.4562540054321289, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.5625, + "step": 180 + }, + { + "completion_length": 74.171875, + "epoch": 0.20044296788482835, + "grad_norm": 0.49861499667167664, + "kl": 0.00799560546875, + "learning_rate": 8.86875e-07, + "loss": 0.005896527087315917, + "reward": 1.8344124555587769, + "reward_std": 0.33161167800426483, + "rewards/GDino": 0.6484833061695099, + "rewards/GIT": 0.3188634589314461, + "rewards/HPSv2": 0.2792530059814453, + "rewards/ORM": 0.587812751531601, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.75, + "step": 181 + }, + { + "completion_length": 66.53125, + "epoch": 0.20155038759689922, + "grad_norm": 0.518588125705719, + "kl": 0.021148681640625, + "learning_rate": 8.8625e-07, + "loss": -0.0032154046930372715, + "reward": 1.6775782704353333, + "reward_std": 0.4542950987815857, + "rewards/GDino": 0.6909389793872833, + "rewards/GIT": 0.31735002249479294, + "rewards/HPSv2": 0.27741050720214844, + "rewards/ORM": 0.39187873899936676, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -22.0625, + "step": 182 + }, + { + "completion_length": 83.171875, + "epoch": 0.2026578073089701, + "grad_norm": 0.4635794758796692, + "kl": 0.015838623046875, + "learning_rate": 8.85625e-07, + "loss": 0.006844737799838185, + "reward": 1.8692994713783264, + "reward_std": 0.3296326994895935, + "rewards/GDino": 0.7293833494186401, + "rewards/GIT": 0.34990622848272324, + "rewards/HPSv2": 0.2678260803222656, + "rewards/ORM": 0.5221837162971497, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.4375, + "step": 183 + }, + { + "completion_length": 63.4375, + "epoch": 0.20376522702104097, + "grad_norm": 0.5085333585739136, + "kl": 0.0120849609375, + "learning_rate": 8.85e-07, + "loss": -0.0026784827932715416, + "reward": 2.799358606338501, + "reward_std": 0.1885242909193039, + "rewards/GDino": 0.925000011920929, + "rewards/GIT": 0.7545149028301239, + "rewards/HPSv2": 0.26367759704589844, + "rewards/ORM": 0.8561660945415497, + "self_certainty_semantic": -25.5, + "self_certainty_token": -21.1875, + "step": 184 + }, + { + "completion_length": 65.859375, + "epoch": 0.20487264673311184, + "grad_norm": 0.5494704842567444, + "kl": 0.013671875, + "learning_rate": 8.84375e-07, + "loss": -0.003346539626363665, + "reward": 2.0845471620559692, + "reward_std": 0.5152666121721268, + "rewards/GDino": 0.7945332229137421, + "rewards/GIT": 0.2876994013786316, + "rewards/HPSv2": 0.27262306213378906, + "rewards/ORM": 0.7296914756298065, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -22.125, + "step": 185 + }, + { + "completion_length": 71.6875, + "epoch": 0.2059800664451827, + "grad_norm": 0.5301854014396667, + "kl": 0.011871337890625, + "learning_rate": 8.8375e-07, + "loss": -0.0013000170001760125, + "reward": 2.0686882734298706, + "reward_std": 0.40786902606487274, + "rewards/GDino": 0.6654029488563538, + "rewards/GIT": 0.3254973590373993, + "rewards/HPSv2": 0.240997314453125, + "rewards/ORM": 0.8367905914783478, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.0625, + "step": 186 + }, + { + "completion_length": 76.890625, + "epoch": 0.2070874861572536, + "grad_norm": 0.4597737789154053, + "kl": 0.011993408203125, + "learning_rate": 8.83125e-07, + "loss": 0.016351854777894914, + "reward": 2.200950801372528, + "reward_std": 0.35277409851551056, + "rewards/GDino": 0.7939618229866028, + "rewards/GIT": 0.5313694775104523, + "rewards/HPSv2": 0.26030731201171875, + "rewards/ORM": 0.615312248468399, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.25, + "step": 187 + }, + { + "completion_length": 65.65625, + "epoch": 0.2081949058693245, + "grad_norm": 0.5319734811782837, + "kl": 0.010162353515625, + "learning_rate": 8.824999999999999e-07, + "loss": 0.00020685815252363682, + "reward": 2.099229574203491, + "reward_std": 0.360196590423584, + "rewards/GDino": 0.7534400224685669, + "rewards/GIT": 0.27092792093753815, + "rewards/HPSv2": 0.2623615264892578, + "rewards/ORM": 0.8125, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.625, + "step": 188 + }, + { + "completion_length": 71.703125, + "epoch": 0.20930232558139536, + "grad_norm": 0.7321242690086365, + "kl": 0.0094451904296875, + "learning_rate": 8.818749999999999e-07, + "loss": -0.004028161056339741, + "reward": 2.337135910987854, + "reward_std": 0.31387007236480713, + "rewards/GDino": 0.7773648500442505, + "rewards/GIT": 0.5682414174079895, + "rewards/HPSv2": 0.27951812744140625, + "rewards/ORM": 0.7120114862918854, + "self_certainty_semantic": -25.75, + "self_certainty_token": -21.25, + "step": 189 + }, + { + "completion_length": 70.203125, + "epoch": 0.21040974529346623, + "grad_norm": 1.9930344820022583, + "kl": 0.0136566162109375, + "learning_rate": 8.812499999999999e-07, + "loss": 0.008943180087953806, + "reward": 2.5060739517211914, + "reward_std": 0.16241375356912613, + "rewards/GDino": 0.9254540205001831, + "rewards/GIT": 0.454538494348526, + "rewards/HPSv2": 0.2667064666748047, + "rewards/ORM": 0.859375, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.75, + "step": 190 + }, + { + "completion_length": 64.796875, + "epoch": 0.2115171650055371, + "grad_norm": 0.4348452091217041, + "kl": 0.007415771484375, + "learning_rate": 8.806249999999999e-07, + "loss": -0.006945850793272257, + "reward": 2.5402393341064453, + "reward_std": 0.2529807686805725, + "rewards/GDino": 0.8751335144042969, + "rewards/GIT": 0.6033133119344711, + "rewards/HPSv2": 0.27858734130859375, + "rewards/ORM": 0.7832051813602448, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.5625, + "step": 191 + }, + { + "completion_length": 89.921875, + "epoch": 0.21262458471760798, + "grad_norm": 0.7680485248565674, + "kl": 0.012481689453125, + "learning_rate": 8.799999999999999e-07, + "loss": 0.005377613822929561, + "reward": 1.8802450299263, + "reward_std": 0.3106888607144356, + "rewards/GDino": 0.6456713378429413, + "rewards/GIT": 0.4135439097881317, + "rewards/HPSv2": 0.2503166198730469, + "rewards/ORM": 0.5707131326198578, + "self_certainty_semantic": -25.5625, + "self_certainty_token": -21.125, + "step": 192 + }, + { + "completion_length": 69.78125, + "epoch": 0.21373200442967885, + "grad_norm": 0.5264883637428284, + "kl": 0.010955810546875, + "learning_rate": 8.793749999999999e-07, + "loss": 0.008317717118188739, + "reward": 1.861718237400055, + "reward_std": 0.4164891242980957, + "rewards/GDino": 0.7109375596046448, + "rewards/GIT": 0.21486494690179825, + "rewards/HPSv2": 0.2839984893798828, + "rewards/ORM": 0.6519171893596649, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -22.1875, + "step": 193 + }, + { + "completion_length": 74.75, + "epoch": 0.21483942414174972, + "grad_norm": 0.5414590835571289, + "kl": 0.0074462890625, + "learning_rate": 8.7875e-07, + "loss": -0.0021489104256033897, + "reward": 1.963248074054718, + "reward_std": 0.4292799085378647, + "rewards/GDino": 0.8057583570480347, + "rewards/GIT": 0.5115346312522888, + "rewards/HPSv2": 0.26822662353515625, + "rewards/ORM": 0.37772834300994873, + "self_certainty_semantic": -25.8125, + "self_certainty_token": -21.25, + "step": 194 + }, + { + "completion_length": 68.203125, + "epoch": 0.2159468438538206, + "grad_norm": 0.45540449023246765, + "kl": 0.01312255859375, + "learning_rate": 8.78125e-07, + "loss": -0.004703107755631208, + "reward": 2.011273205280304, + "reward_std": 0.4216621667146683, + "rewards/GDino": 0.7242187261581421, + "rewards/GIT": 0.5994383990764618, + "rewards/HPSv2": 0.27542877197265625, + "rewards/ORM": 0.41218727827072144, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.6875, + "step": 195 + }, + { + "completion_length": 79.5, + "epoch": 0.21705426356589147, + "grad_norm": 0.5480747818946838, + "kl": 0.007293701171875, + "learning_rate": 8.774999999999999e-07, + "loss": -0.001077285036444664, + "reward": 2.287221312522888, + "reward_std": 0.3154482841491699, + "rewards/GDino": 0.7235225439071655, + "rewards/GIT": 0.5517593622207642, + "rewards/HPSv2": 0.2792186737060547, + "rewards/ORM": 0.7327205836772919, + "self_certainty_semantic": -25.6875, + "self_certainty_token": -21.6875, + "step": 196 + }, + { + "completion_length": 67.34375, + "epoch": 0.21816168327796234, + "grad_norm": 0.648148238658905, + "kl": 0.01416015625, + "learning_rate": 8.76875e-07, + "loss": 0.0010744737228378654, + "reward": 2.3249343037605286, + "reward_std": 0.40621738135814667, + "rewards/GDino": 0.7385416626930237, + "rewards/GIT": 0.4809828922152519, + "rewards/HPSv2": 0.2538471221923828, + "rewards/ORM": 0.8515625, + "self_certainty_semantic": -25.75, + "self_certainty_token": -20.625, + "step": 197 + }, + { + "completion_length": 74.59375, + "epoch": 0.21926910299003322, + "grad_norm": 0.978819727897644, + "kl": 0.01177978515625, + "learning_rate": 8.7625e-07, + "loss": 0.004215072840452194, + "reward": 2.1429388523101807, + "reward_std": 0.3008539155125618, + "rewards/GDino": 0.8473958671092987, + "rewards/GIT": 0.5675143599510193, + "rewards/HPSv2": 0.2627582550048828, + "rewards/ORM": 0.4652703106403351, + "self_certainty_semantic": -25.625, + "self_certainty_token": -21.4375, + "step": 198 + }, + { + "completion_length": 65.515625, + "epoch": 0.2203765227021041, + "grad_norm": 0.6454822421073914, + "kl": 0.01220703125, + "learning_rate": 8.75625e-07, + "loss": -0.0005628032376989722, + "reward": 2.50363028049469, + "reward_std": 0.3133077025413513, + "rewards/GDino": 0.8082683682441711, + "rewards/GIT": 0.6633397042751312, + "rewards/HPSv2": 0.2600593566894531, + "rewards/ORM": 0.7719629406929016, + "self_certainty_semantic": -25.5, + "self_certainty_token": -22.0, + "step": 199 + }, + { + "completion_length": 61.59375, + "epoch": 0.22148394241417496, + "grad_norm": 0.6677749156951904, + "kl": 0.0155029296875, + "learning_rate": 8.75e-07, + "loss": 0.0032004087697714567, + "reward": 2.0826478004455566, + "reward_std": 0.48166391253471375, + "rewards/GDino": 0.7572438716888428, + "rewards/GIT": 0.2937658578157425, + "rewards/HPSv2": 0.27109718322753906, + "rewards/ORM": 0.7605408430099487, + "self_certainty_semantic": -25.75, + "self_certainty_token": -20.5625, + "step": 200 + } + ], + "logging_steps": 1.0, + "max_steps": 1600, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}