elsvastika commited on
Commit
fd7b7b0
·
verified ·
1 Parent(s): aeef55a

End of training

Browse files
adapter_config.json CHANGED
@@ -24,13 +24,13 @@
24
  "rank_pattern": {},
25
  "revision": null,
26
  "target_modules": [
27
- "k_proj",
28
- "gate_proj",
29
- "down_proj",
30
  "o_proj",
31
  "q_proj",
32
- "v_proj",
33
- "up_proj"
 
 
34
  ],
35
  "task_type": "CAUSAL_LM",
36
  "trainable_token_indices": null,
 
24
  "rank_pattern": {},
25
  "revision": null,
26
  "target_modules": [
27
+ "v_proj",
 
 
28
  "o_proj",
29
  "q_proj",
30
+ "gate_proj",
31
+ "up_proj",
32
+ "k_proj",
33
+ "down_proj"
34
  ],
35
  "task_type": "CAUSAL_LM",
36
  "trainable_token_indices": null,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ead40382b37ef28866c5216b320811ecf7182ed340ceccf63d8e53db5fc12a0d
3
  size 35237104
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c946f7d79d87a58ad003239290ea9ef4833a06139b16c006a7e622a4004f1aa4
3
  size 35237104
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 2.602725280667073e-06,
4
- "train_runtime": 1884.3714,
5
- "train_samples": 36,
6
- "train_samples_per_second": 0.425,
7
- "train_steps_per_second": 0.027
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 3.2592993572203946e-06,
4
+ "train_runtime": 1944.3016,
5
+ "train_samples": 24,
6
+ "train_samples_per_second": 0.247,
7
+ "train_steps_per_second": 0.015
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 2.602725280667073e-06,
4
- "train_runtime": 1884.3714,
5
- "train_samples": 36,
6
- "train_samples_per_second": 0.425,
7
- "train_steps_per_second": 0.027
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 3.2592993572203946e-06,
4
+ "train_runtime": 1944.3016,
5
+ "train_samples": 24,
6
+ "train_samples_per_second": 0.247,
7
+ "train_steps_per_second": 0.015
8
  }
trainer_state.json CHANGED
@@ -1,502 +1,312 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 5.555555555555555,
5
  "eval_steps": 500,
6
- "global_step": 50,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "completion_length": 254.0,
13
- "epoch": 0.2222222222222222,
14
- "grad_norm": 2.455533266067505,
15
- "kl": 0.0016071582067525014,
16
- "learning_rate": 5e-07,
17
  "loss": 0.0,
18
- "reward": 0.8821750227361917,
19
- "reward_std": 0.7979278466664255,
20
- "rewards/concensus_correctness_reward_func": 0.049937501549720764,
21
- "rewards/consensus_reward_func": 0.0625,
22
  "rewards/cumulative_reward_2": 0.0,
23
  "rewards/final_correctness_reward_func": 0.3125,
24
- "rewards/question_recreation_reward_func": 0.26773752458393574,
25
- "rewards/soft_format_reward_func": 0.0,
26
  "rewards/strict_format_reward_func": 0.0,
27
- "rewards/xmlcount_reward_func": 0.1895000054500997,
28
  "step": 2
29
  },
30
  {
31
- "completion_length": 369.3125,
32
- "epoch": 0.4444444444444444,
33
- "grad_norm": 2.520486354827881,
34
- "kl": 0.0038820735207991675,
35
- "learning_rate": 4.978612153434526e-07,
36
  "loss": 0.0,
37
- "reward": 0.3699257434345782,
38
- "reward_std": 0.5709783472120762,
39
- "rewards/concensus_correctness_reward_func": 0.0,
40
  "rewards/consensus_reward_func": 0.0,
41
  "rewards/cumulative_reward_2": 0.0,
42
  "rewards/final_correctness_reward_func": 0.0,
43
- "rewards/question_recreation_reward_func": 0.19958198571112007,
44
  "rewards/soft_format_reward_func": 0.0,
45
  "rewards/strict_format_reward_func": 0.0,
46
- "rewards/xmlcount_reward_func": 0.17034375481307507,
47
  "step": 4
48
  },
49
  {
50
- "completion_length": 467.875,
51
- "epoch": 0.6666666666666666,
52
- "grad_norm": 1.3698705434799194,
53
- "kl": 0.0013275225574034266,
54
- "learning_rate": 4.91481456572267e-07,
55
  "loss": 0.0,
56
- "reward": 0.6990349255502224,
57
- "reward_std": 0.9675047248601913,
58
- "rewards/concensus_correctness_reward_func": 0.046875,
59
- "rewards/consensus_reward_func": 0.0625,
60
- "rewards/cumulative_reward_2": 0.0,
61
- "rewards/final_correctness_reward_func": 0.0625,
62
- "rewards/question_recreation_reward_func": 0.33225369080901146,
63
- "rewards/soft_format_reward_func": 0.0,
64
- "rewards/strict_format_reward_func": 0.0,
65
- "rewards/xmlcount_reward_func": 0.19490624405443668,
66
- "step": 6
67
- },
68
- {
69
- "completion_length": 323.34375,
70
- "epoch": 0.8888888888888888,
71
- "grad_norm": 2.7056479454040527,
72
- "kl": 0.00173920994711807,
73
- "learning_rate": 4.809698831278217e-07,
74
- "loss": 0.0,
75
- "reward": 0.5385207324288785,
76
- "reward_std": 0.8355418732389808,
77
- "rewards/concensus_correctness_reward_func": 0.02787500061094761,
78
  "rewards/consensus_reward_func": 0.0,
79
  "rewards/cumulative_reward_2": 0.0,
80
  "rewards/final_correctness_reward_func": 0.125,
81
- "rewards/question_recreation_reward_func": 0.2965519982390106,
82
- "rewards/soft_format_reward_func": 0.015625,
83
- "rewards/strict_format_reward_func": 0.0,
84
- "rewards/xmlcount_reward_func": 0.07346874848008156,
85
- "step": 8
86
- },
87
- {
88
- "completion_length": 276.09375,
89
- "epoch": 1.1111111111111112,
90
- "grad_norm": 3.8186511993408203,
91
- "kl": 0.0028932989152963273,
92
- "learning_rate": 4.6650635094610966e-07,
93
- "loss": 0.0,
94
- "reward": 1.1083624437451363,
95
- "reward_std": 1.7071605939418077,
96
- "rewards/concensus_correctness_reward_func": 0.625,
97
- "rewards/consensus_reward_func": 0.0,
98
- "rewards/cumulative_reward_2": 0.0,
99
- "rewards/final_correctness_reward_func": 0.1875,
100
- "rewards/question_recreation_reward_func": 0.25786245451308787,
101
  "rewards/soft_format_reward_func": 0.0,
102
  "rewards/strict_format_reward_func": 0.0,
103
- "rewards/xmlcount_reward_func": 0.03799999970942736,
104
- "step": 10
105
  },
106
  {
107
- "completion_length": 255.59375,
108
  "epoch": 1.3333333333333333,
109
- "grad_norm": 34.07455062866211,
110
- "kl": 0.0037407633935799822,
111
- "learning_rate": 4.483383350728088e-07,
112
- "loss": 0.0,
113
- "reward": 1.2880361340939999,
114
- "reward_std": 1.625240983441472,
115
- "rewards/concensus_correctness_reward_func": 0.625,
116
- "rewards/consensus_reward_func": 0.0625,
117
- "rewards/cumulative_reward_2": 0.0,
118
- "rewards/final_correctness_reward_func": 0.125,
119
- "rewards/question_recreation_reward_func": 0.32416112907230854,
120
- "rewards/soft_format_reward_func": 0.0,
121
- "rewards/strict_format_reward_func": 0.0,
122
- "rewards/xmlcount_reward_func": 0.15137499663978815,
123
- "step": 12
124
- },
125
- {
126
- "completion_length": 285.21875,
127
- "epoch": 1.5555555555555556,
128
- "grad_norm": 3.052701234817505,
129
- "kl": 0.002863182016881183,
130
- "learning_rate": 4.2677669529663686e-07,
131
  "loss": 0.0,
132
- "reward": 0.607268082909286,
133
- "reward_std": 0.6088872440159321,
134
- "rewards/concensus_correctness_reward_func": 0.022187499329447746,
135
- "rewards/consensus_reward_func": 0.0,
136
  "rewards/cumulative_reward_2": 0.0,
137
- "rewards/final_correctness_reward_func": 0.0,
138
- "rewards/question_recreation_reward_func": 0.38814307004213333,
139
  "rewards/soft_format_reward_func": 0.0,
140
  "rewards/strict_format_reward_func": 0.0,
141
- "rewards/xmlcount_reward_func": 0.19693750143051147,
142
- "step": 14
143
  },
144
  {
145
- "completion_length": 347.65625,
146
- "epoch": 1.7777777777777777,
147
- "grad_norm": 5.15467643737793,
148
- "kl": 0.0014800850040046498,
149
- "learning_rate": 4.0219035725218013e-07,
150
  "loss": 0.0,
151
- "reward": 0.1809218251146376,
152
- "reward_std": 0.7999278882052749,
153
- "rewards/concensus_correctness_reward_func": 0.0,
154
  "rewards/consensus_reward_func": 0.0,
155
  "rewards/cumulative_reward_2": 0.0,
156
  "rewards/final_correctness_reward_func": 0.0625,
157
- "rewards/question_recreation_reward_func": 0.2310780775733292,
158
- "rewards/soft_format_reward_func": 0.0,
159
- "rewards/strict_format_reward_func": 0.0,
160
- "rewards/xmlcount_reward_func": -0.112656245008111,
161
- "step": 16
162
  },
163
  {
164
- "completion_length": 441.25,
165
  "epoch": 2.0,
166
- "grad_norm": 4.815533638000488,
167
- "kl": 0.0013374313130043447,
168
- "learning_rate": 3.75e-07,
169
  "loss": 0.0,
170
- "reward": 0.5386425573378801,
171
- "reward_std": 0.6832009451463819,
172
- "rewards/concensus_correctness_reward_func": 0.046875,
173
- "rewards/consensus_reward_func": 0.0625,
174
- "rewards/cumulative_reward_2": 0.0,
175
- "rewards/final_correctness_reward_func": 0.0,
176
- "rewards/question_recreation_reward_func": 0.23683004680788144,
177
- "rewards/soft_format_reward_func": 0.0,
178
- "rewards/strict_format_reward_func": 0.0,
179
- "rewards/xmlcount_reward_func": 0.19243750348687172,
180
- "step": 18
181
- },
182
- {
183
- "completion_length": 437.25,
184
- "epoch": 2.2222222222222223,
185
- "grad_norm": 2.5218758583068848,
186
- "kl": 0.0015388188403449021,
187
- "learning_rate": 3.4567085809127245e-07,
188
- "loss": 0.0,
189
- "reward": 0.7281985133886337,
190
- "reward_std": 0.6127606704831123,
191
  "rewards/concensus_correctness_reward_func": 0.0,
192
  "rewards/consensus_reward_func": 0.0,
193
  "rewards/cumulative_reward_2": 0.0,
194
- "rewards/final_correctness_reward_func": 0.0,
195
- "rewards/question_recreation_reward_func": 0.45726102218031883,
196
  "rewards/soft_format_reward_func": 0.0,
197
  "rewards/strict_format_reward_func": 0.0,
198
- "rewards/xmlcount_reward_func": 0.2709374986588955,
199
- "step": 20
200
  },
201
  {
202
- "completion_length": 387.75,
203
- "epoch": 2.4444444444444446,
204
- "grad_norm": 1.3259650468826294,
205
- "kl": 0.0036219921821611933,
206
- "learning_rate": 3.147047612756302e-07,
207
  "loss": 0.0,
208
- "reward": 0.2731318287551403,
209
- "reward_std": 0.2530765999108553,
210
- "rewards/concensus_correctness_reward_func": 0.0,
211
  "rewards/consensus_reward_func": 0.0,
212
  "rewards/cumulative_reward_2": 0.0,
213
- "rewards/final_correctness_reward_func": 0.0,
214
- "rewards/question_recreation_reward_func": 0.20441308384761214,
215
  "rewards/soft_format_reward_func": 0.0,
216
  "rewards/strict_format_reward_func": 0.0,
217
- "rewards/xmlcount_reward_func": 0.06871875282377005,
218
- "step": 22
219
  },
220
  {
221
- "completion_length": 447.09375,
222
  "epoch": 2.6666666666666665,
223
- "grad_norm": 6.329639434814453,
224
- "kl": 0.0021589112002402544,
225
- "learning_rate": 2.826315480550129e-07,
226
  "loss": 0.0,
227
- "reward": 1.2416966576129198,
228
- "reward_std": 2.0539041608572006,
229
- "rewards/concensus_correctness_reward_func": 0.625,
230
- "rewards/consensus_reward_func": 0.0,
231
  "rewards/cumulative_reward_2": 0.0,
232
- "rewards/final_correctness_reward_func": 0.1875,
233
- "rewards/question_recreation_reward_func": 0.3013216257095337,
234
  "rewards/soft_format_reward_func": 0.0,
235
  "rewards/strict_format_reward_func": 0.0,
236
- "rewards/xmlcount_reward_func": 0.1278750030323863,
237
- "step": 24
238
  },
239
  {
240
- "completion_length": 301.875,
241
- "epoch": 2.888888888888889,
242
- "grad_norm": 4.056438446044922,
243
- "kl": 0.0024828215537127107,
244
- "learning_rate": 2.5e-07,
245
  "loss": 0.0,
246
- "reward": 0.2829342377372086,
247
- "reward_std": 0.33365900977514684,
248
- "rewards/concensus_correctness_reward_func": 0.0,
249
  "rewards/consensus_reward_func": 0.0,
250
  "rewards/cumulative_reward_2": 0.0,
251
- "rewards/final_correctness_reward_func": 0.0,
252
- "rewards/question_recreation_reward_func": 0.1184342410415411,
253
- "rewards/soft_format_reward_func": 0.0,
254
- "rewards/strict_format_reward_func": 0.0,
255
- "rewards/xmlcount_reward_func": 0.16449999809265137,
256
- "step": 26
257
- },
258
- {
259
- "completion_length": 341.15625,
260
- "epoch": 3.111111111111111,
261
- "grad_norm": 2.550891876220703,
262
- "kl": 0.002100169222103432,
263
- "learning_rate": 2.1736845194498716e-07,
264
- "loss": 0.0,
265
- "reward": 0.3925126292742789,
266
- "reward_std": 0.7323208572342992,
267
- "rewards/concensus_correctness_reward_func": 0.014562499709427357,
268
- "rewards/consensus_reward_func": 0.0625,
269
- "rewards/cumulative_reward_2": 0.0,
270
- "rewards/final_correctness_reward_func": 0.0,
271
- "rewards/question_recreation_reward_func": 0.29879387514665723,
272
  "rewards/soft_format_reward_func": 0.0,
273
  "rewards/strict_format_reward_func": 0.0,
274
- "rewards/xmlcount_reward_func": 0.016656249528750777,
275
- "step": 28
276
  },
277
  {
278
- "completion_length": 388.03125,
279
  "epoch": 3.3333333333333335,
280
- "grad_norm": 1.4323590993881226,
281
- "kl": 0.004696541313023772,
282
- "learning_rate": 1.8529523872436977e-07,
283
  "loss": 0.0,
284
- "reward": 0.05710854474455118,
285
- "reward_std": 0.5547531270422041,
286
- "rewards/concensus_correctness_reward_func": 0.0078125,
287
  "rewards/consensus_reward_func": 0.0625,
288
  "rewards/cumulative_reward_2": 0.0,
289
  "rewards/final_correctness_reward_func": 0.0,
290
- "rewards/question_recreation_reward_func": 0.19385854969732463,
291
  "rewards/soft_format_reward_func": 0.0,
292
  "rewards/strict_format_reward_func": 0.0,
293
- "rewards/xmlcount_reward_func": -0.20706249913200736,
294
- "step": 30
295
  },
296
  {
297
- "completion_length": 352.90625,
298
- "epoch": 3.5555555555555554,
299
- "grad_norm": 2.221634864807129,
300
- "kl": 0.002907156704168301,
301
- "learning_rate": 1.5432914190872756e-07,
302
  "loss": 0.0,
303
- "reward": 0.3871938968077302,
304
- "reward_std": 0.3934658619109541,
305
- "rewards/concensus_correctness_reward_func": 0.0,
306
- "rewards/consensus_reward_func": 0.0,
307
  "rewards/cumulative_reward_2": 0.0,
308
  "rewards/final_correctness_reward_func": 0.0,
309
- "rewards/question_recreation_reward_func": 0.2629126524552703,
310
- "rewards/soft_format_reward_func": 0.0,
311
- "rewards/strict_format_reward_func": 0.0,
312
- "rewards/xmlcount_reward_func": 0.1242812555283308,
313
- "step": 32
314
- },
315
- {
316
- "completion_length": 384.375,
317
- "epoch": 3.7777777777777777,
318
- "grad_norm": 2.7794787883758545,
319
- "kl": 0.001867200349806808,
320
- "learning_rate": 1.2500000000000005e-07,
321
- "loss": 0.0,
322
- "reward": 0.9573859982192516,
323
- "reward_std": 1.0359943909570575,
324
- "rewards/concensus_correctness_reward_func": 0.11443750280886889,
325
- "rewards/consensus_reward_func": 0.1875,
326
- "rewards/cumulative_reward_2": 0.0,
327
- "rewards/final_correctness_reward_func": 0.0625,
328
- "rewards/question_recreation_reward_func": 0.3830109708942473,
329
  "rewards/soft_format_reward_func": 0.0,
330
  "rewards/strict_format_reward_func": 0.0,
331
- "rewards/xmlcount_reward_func": 0.20993749890476465,
332
- "step": 34
333
  },
334
  {
335
- "completion_length": 361.375,
336
  "epoch": 4.0,
337
- "grad_norm": 3.378568172454834,
338
- "kl": 0.0019658175588119775,
339
- "learning_rate": 9.780964274781983e-08,
340
- "loss": 0.0,
341
- "reward": 0.550561910495162,
342
- "reward_std": 0.5597149441018701,
343
- "rewards/concensus_correctness_reward_func": 0.03125,
344
- "rewards/consensus_reward_func": 0.0,
345
- "rewards/cumulative_reward_2": 0.0,
346
- "rewards/final_correctness_reward_func": 0.125,
347
- "rewards/question_recreation_reward_func": 0.27071817917749286,
348
- "rewards/soft_format_reward_func": 0.0,
349
- "rewards/strict_format_reward_func": 0.0,
350
- "rewards/xmlcount_reward_func": 0.12359375134110451,
351
- "step": 36
352
- },
353
- {
354
- "completion_length": 282.96875,
355
- "epoch": 4.222222222222222,
356
- "grad_norm": 4.209115505218506,
357
- "kl": 0.0013284565284266137,
358
- "learning_rate": 7.322330470336313e-08,
359
  "loss": 0.0,
360
- "reward": 1.8079969780519605,
361
- "reward_std": 2.005708161741495,
362
- "rewards/concensus_correctness_reward_func": 1.268062500283122,
363
  "rewards/consensus_reward_func": 0.0,
364
  "rewards/cumulative_reward_2": 0.0,
365
  "rewards/final_correctness_reward_func": 0.25,
366
- "rewards/question_recreation_reward_func": 0.20130947075085714,
367
  "rewards/soft_format_reward_func": 0.0,
368
  "rewards/strict_format_reward_func": 0.0,
369
- "rewards/xmlcount_reward_func": 0.08862500544637442,
370
- "step": 38
371
  },
372
  {
373
- "completion_length": 431.4375,
374
- "epoch": 4.444444444444445,
375
- "grad_norm": 29.043092727661133,
376
- "kl": 0.003590297739719972,
377
- "learning_rate": 5.166166492719124e-08,
378
  "loss": 0.0,
379
- "reward": 0.5267084827646613,
380
- "reward_std": 0.684504278935492,
381
  "rewards/concensus_correctness_reward_func": 0.0,
382
- "rewards/consensus_reward_func": 0.0,
383
- "rewards/cumulative_reward_2": 0.0,
384
- "rewards/final_correctness_reward_func": 0.0625,
385
- "rewards/question_recreation_reward_func": 0.35548973828554153,
386
- "rewards/soft_format_reward_func": 0.0,
387
- "rewards/strict_format_reward_func": 0.0,
388
- "rewards/xmlcount_reward_func": 0.10871875286102295,
389
- "step": 40
390
- },
391
- {
392
- "completion_length": 384.59375,
393
- "epoch": 4.666666666666667,
394
- "grad_norm": 29.70347785949707,
395
- "kl": 0.001960429195605684,
396
- "learning_rate": 3.349364905389032e-08,
397
- "loss": 0.0,
398
- "reward": 1.234227987471968,
399
- "reward_std": 1.9329941319301724,
400
- "rewards/concensus_correctness_reward_func": 0.6629374995827675,
401
  "rewards/consensus_reward_func": 0.0625,
402
  "rewards/cumulative_reward_2": 0.0,
403
- "rewards/final_correctness_reward_func": 0.125,
404
- "rewards/question_recreation_reward_func": 0.257196772727184,
405
- "rewards/soft_format_reward_func": 0.0,
406
- "rewards/strict_format_reward_func": 0.0,
407
- "rewards/xmlcount_reward_func": 0.12659375229850411,
408
- "step": 42
409
- },
410
- {
411
- "completion_length": 364.0625,
412
- "epoch": 4.888888888888889,
413
- "grad_norm": 55.00151062011719,
414
- "kl": 0.002629825277836062,
415
- "learning_rate": 1.9030116872178314e-08,
416
- "loss": 0.0,
417
- "reward": 0.5272765271365643,
418
- "reward_std": 0.8430194035172462,
419
- "rewards/concensus_correctness_reward_func": 0.01837499998509884,
420
- "rewards/consensus_reward_func": 0.0,
421
- "rewards/cumulative_reward_2": 0.0,
422
  "rewards/final_correctness_reward_func": 0.0625,
423
- "rewards/question_recreation_reward_func": 0.31580778677016497,
424
  "rewards/soft_format_reward_func": 0.0,
425
  "rewards/strict_format_reward_func": 0.0,
426
- "rewards/xmlcount_reward_func": 0.13059375621378422,
427
- "step": 44
428
  },
429
  {
430
- "completion_length": 304.40625,
431
- "epoch": 5.111111111111111,
432
- "grad_norm": 42.15534973144531,
433
- "kl": 0.005508741844096221,
434
- "learning_rate": 8.518543427732949e-09,
435
  "loss": 0.0,
436
- "reward": 0.4785704296082258,
437
- "reward_std": 0.8159382930025458,
438
- "rewards/concensus_correctness_reward_func": 0.01837499998509884,
439
- "rewards/consensus_reward_func": 0.0,
440
  "rewards/cumulative_reward_2": 0.0,
441
  "rewards/final_correctness_reward_func": 0.0625,
442
- "rewards/question_recreation_reward_func": 0.33713291585445404,
443
- "rewards/soft_format_reward_func": 0.0,
444
- "rewards/strict_format_reward_func": 0.0,
445
- "rewards/xmlcount_reward_func": 0.06056249886751175,
446
- "step": 46
447
- },
448
- {
449
- "completion_length": 353.375,
450
- "epoch": 5.333333333333333,
451
- "grad_norm": 6.933172225952148,
452
- "kl": 0.002127421241311822,
453
- "learning_rate": 2.1387846565474044e-09,
454
- "loss": 0.0,
455
- "reward": 0.6708917245268822,
456
- "reward_std": 0.9302016571164131,
457
- "rewards/concensus_correctness_reward_func": 0.006812499836087227,
458
- "rewards/consensus_reward_func": 0.0,
459
- "rewards/cumulative_reward_2": 0.0,
460
- "rewards/final_correctness_reward_func": 0.3125,
461
- "rewards/question_recreation_reward_func": 0.2251729667186737,
462
  "rewards/soft_format_reward_func": 0.0,
463
  "rewards/strict_format_reward_func": 0.0,
464
- "rewards/xmlcount_reward_func": 0.1264062523841858,
465
- "step": 48
466
  },
467
  {
468
- "completion_length": 346.9375,
469
- "epoch": 5.555555555555555,
470
- "grad_norm": 7.681344985961914,
471
- "kl": 0.0038058556092437357,
472
  "learning_rate": 0.0,
473
  "loss": 0.0,
474
- "reward": 0.41686041094362736,
475
- "reward_std": 0.7147915656678379,
476
  "rewards/concensus_correctness_reward_func": 0.0,
477
  "rewards/consensus_reward_func": 0.0,
478
  "rewards/cumulative_reward_2": 0.0,
479
- "rewards/final_correctness_reward_func": 0.0,
480
- "rewards/question_recreation_reward_func": 0.2795791446696967,
481
  "rewards/soft_format_reward_func": 0.0,
482
  "rewards/strict_format_reward_func": 0.0,
483
- "rewards/xmlcount_reward_func": 0.13728125765919685,
484
- "step": 50
485
  },
486
  {
487
- "epoch": 5.555555555555555,
488
- "step": 50,
489
  "total_flos": 0.0,
490
- "train_loss": 2.602725280667073e-06,
491
- "train_runtime": 1884.3714,
492
- "train_samples_per_second": 0.425,
493
- "train_steps_per_second": 0.027
494
  }
495
  ],
496
  "logging_steps": 2,
497
- "max_steps": 50,
498
  "num_input_tokens_seen": 0,
499
- "num_train_epochs": 6,
500
  "save_steps": 25,
501
  "stateful_callbacks": {
502
  "TrainerControl": {
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 5.0,
5
  "eval_steps": 500,
6
+ "global_step": 30,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "completion_length": 311.9375,
13
+ "epoch": 0.3333333333333333,
14
+ "grad_norm": 2.2533836364746094,
15
+ "kl": 0.0024804054301057477,
16
+ "learning_rate": 4.985344892885899e-07,
17
  "loss": 0.0,
18
+ "reward": 0.6862528233905323,
19
+ "reward_std": 0.8880199125269428,
20
+ "rewards/concensus_correctness_reward_func": 0.0,
21
+ "rewards/consensus_reward_func": 0.0,
22
  "rewards/cumulative_reward_2": 0.0,
23
  "rewards/final_correctness_reward_func": 0.3125,
24
+ "rewards/question_recreation_reward_func": 0.3277840710361488,
25
+ "rewards/soft_format_reward_func": 0.015625,
26
  "rewards/strict_format_reward_func": 0.0,
27
+ "rewards/xmlcount_reward_func": 0.030343750026077032,
28
  "step": 2
29
  },
30
  {
31
+ "completion_length": 314.40625,
32
+ "epoch": 0.6666666666666666,
33
+ "grad_norm": 4.825911045074463,
34
+ "kl": 0.0020024704354000278,
35
+ "learning_rate": 4.869132927957006e-07,
36
  "loss": 0.0,
37
+ "reward": 0.5271140232216567,
38
+ "reward_std": 0.5235589223448187,
39
+ "rewards/concensus_correctness_reward_func": 0.07243750058114529,
40
  "rewards/consensus_reward_func": 0.0,
41
  "rewards/cumulative_reward_2": 0.0,
42
  "rewards/final_correctness_reward_func": 0.0,
43
+ "rewards/question_recreation_reward_func": 0.3320827658753842,
44
  "rewards/soft_format_reward_func": 0.0,
45
  "rewards/strict_format_reward_func": 0.0,
46
+ "rewards/xmlcount_reward_func": 0.12259375443682075,
47
  "step": 4
48
  },
49
  {
50
+ "completion_length": 382.9375,
51
+ "epoch": 1.0,
52
+ "grad_norm": 2.1020917892456055,
53
+ "kl": 0.0017269225791096687,
54
+ "learning_rate": 4.642142940418973e-07,
55
  "loss": 0.0,
56
+ "reward": 0.7331130523234606,
57
+ "reward_std": 0.7579053556546569,
58
+ "rewards/concensus_correctness_reward_func": 0.03125,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  "rewards/consensus_reward_func": 0.0,
60
  "rewards/cumulative_reward_2": 0.0,
61
  "rewards/final_correctness_reward_func": 0.125,
62
+ "rewards/question_recreation_reward_func": 0.386425550095737,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  "rewards/soft_format_reward_func": 0.0,
64
  "rewards/strict_format_reward_func": 0.0,
65
+ "rewards/xmlcount_reward_func": 0.19043749384582043,
66
+ "step": 6
67
  },
68
  {
69
+ "completion_length": 298.71875,
70
  "epoch": 1.3333333333333333,
71
+ "grad_norm": 2.3231563568115234,
72
+ "kl": 0.011588757915887982,
73
+ "learning_rate": 4.314988729807827e-07,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  "loss": 0.0,
75
+ "reward": 0.960457656532526,
76
+ "reward_std": 0.9214519280940294,
77
+ "rewards/concensus_correctness_reward_func": 0.0,
78
+ "rewards/consensus_reward_func": 0.125,
79
  "rewards/cumulative_reward_2": 0.0,
80
+ "rewards/final_correctness_reward_func": 0.1875,
81
+ "rewards/question_recreation_reward_func": 0.458113893866539,
82
  "rewards/soft_format_reward_func": 0.0,
83
  "rewards/strict_format_reward_func": 0.0,
84
+ "rewards/xmlcount_reward_func": 0.18984374217689037,
85
+ "step": 8
86
  },
87
  {
88
+ "completion_length": 343.65625,
89
+ "epoch": 1.6666666666666665,
90
+ "grad_norm": 12.242687225341797,
91
+ "kl": 0.0029063843467156403,
92
+ "learning_rate": 3.902967663405956e-07,
93
  "loss": 0.0,
94
+ "reward": 0.7999913152307272,
95
+ "reward_std": 0.7998636066913605,
96
+ "rewards/concensus_correctness_reward_func": 0.0625,
97
  "rewards/consensus_reward_func": 0.0,
98
  "rewards/cumulative_reward_2": 0.0,
99
  "rewards/final_correctness_reward_func": 0.0625,
100
+ "rewards/question_recreation_reward_func": 0.37349133379757404,
101
+ "rewards/soft_format_reward_func": 0.015625,
102
+ "rewards/strict_format_reward_func": 0.015625,
103
+ "rewards/xmlcount_reward_func": 0.2702499981969595,
104
+ "step": 10
105
  },
106
  {
107
+ "completion_length": 387.0625,
108
  "epoch": 2.0,
109
+ "grad_norm": 4.154313087463379,
110
+ "kl": 0.003268745364039205,
111
+ "learning_rate": 3.4253453883497864e-07,
112
  "loss": 0.0,
113
+ "reward": 0.38145725440699607,
114
+ "reward_std": 0.7882596589624882,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  "rewards/concensus_correctness_reward_func": 0.0,
116
  "rewards/consensus_reward_func": 0.0,
117
  "rewards/cumulative_reward_2": 0.0,
118
+ "rewards/final_correctness_reward_func": 0.125,
119
+ "rewards/question_recreation_reward_func": 0.24976976320613176,
120
  "rewards/soft_format_reward_func": 0.0,
121
  "rewards/strict_format_reward_func": 0.0,
122
+ "rewards/xmlcount_reward_func": 0.006687496788799763,
123
+ "step": 12
124
  },
125
  {
126
+ "completion_length": 337.4375,
127
+ "epoch": 2.3333333333333335,
128
+ "grad_norm": 5.914424419403076,
129
+ "kl": 0.0017011765885399655,
130
+ "learning_rate": 2.9044549913819124e-07,
131
  "loss": 0.0,
132
+ "reward": 0.5687886712839827,
133
+ "reward_std": 0.7552150477422401,
134
+ "rewards/concensus_correctness_reward_func": 0.14237500727176666,
135
  "rewards/consensus_reward_func": 0.0,
136
  "rewards/cumulative_reward_2": 0.0,
137
+ "rewards/final_correctness_reward_func": 0.0625,
138
+ "rewards/question_recreation_reward_func": 0.2534136431058869,
139
  "rewards/soft_format_reward_func": 0.0,
140
  "rewards/strict_format_reward_func": 0.0,
141
+ "rewards/xmlcount_reward_func": 0.11049999855458736,
142
+ "step": 14
143
  },
144
  {
145
+ "completion_length": 382.96875,
146
  "epoch": 2.6666666666666665,
147
+ "grad_norm": 3.6555304527282715,
148
+ "kl": 0.002678310018382035,
149
+ "learning_rate": 2.3646527285364563e-07,
150
  "loss": 0.0,
151
+ "reward": 0.5685348343104124,
152
+ "reward_std": 0.7404244616627693,
153
+ "rewards/concensus_correctness_reward_func": 0.03125,
154
+ "rewards/consensus_reward_func": 0.0625,
155
  "rewards/cumulative_reward_2": 0.0,
156
+ "rewards/final_correctness_reward_func": 0.0625,
157
+ "rewards/question_recreation_reward_func": 0.2857223302125931,
158
  "rewards/soft_format_reward_func": 0.0,
159
  "rewards/strict_format_reward_func": 0.0,
160
+ "rewards/xmlcount_reward_func": 0.126562493853271,
161
+ "step": 16
162
  },
163
  {
164
+ "completion_length": 356.0,
165
+ "epoch": 3.0,
166
+ "grad_norm": 9.670379638671875,
167
+ "kl": 0.002987725820275955,
168
+ "learning_rate": 1.8311791536769483e-07,
169
  "loss": 0.0,
170
+ "reward": 1.5724409651011229,
171
+ "reward_std": 1.7105311714112759,
172
+ "rewards/concensus_correctness_reward_func": 0.65625,
173
  "rewards/consensus_reward_func": 0.0,
174
  "rewards/cumulative_reward_2": 0.0,
175
+ "rewards/final_correctness_reward_func": 0.3125,
176
+ "rewards/question_recreation_reward_func": 0.4052846720442176,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  "rewards/soft_format_reward_func": 0.0,
178
  "rewards/strict_format_reward_func": 0.0,
179
+ "rewards/xmlcount_reward_func": 0.19840626372024417,
180
+ "step": 18
181
  },
182
  {
183
+ "completion_length": 354.96875,
184
  "epoch": 3.3333333333333335,
185
+ "grad_norm": 2.7042019367218018,
186
+ "kl": 0.0031882738694548607,
187
+ "learning_rate": 1.328978898250525e-07,
188
  "loss": 0.0,
189
+ "reward": 0.5442854713182896,
190
+ "reward_std": 0.4820221420377493,
191
+ "rewards/concensus_correctness_reward_func": 0.0,
192
  "rewards/consensus_reward_func": 0.0625,
193
  "rewards/cumulative_reward_2": 0.0,
194
  "rewards/final_correctness_reward_func": 0.0,
195
+ "rewards/question_recreation_reward_func": 0.3049104630481452,
196
  "rewards/soft_format_reward_func": 0.0,
197
  "rewards/strict_format_reward_func": 0.0,
198
+ "rewards/xmlcount_reward_func": 0.1768750064074993,
199
+ "step": 20
200
  },
201
  {
202
+ "completion_length": 305.6875,
203
+ "epoch": 3.6666666666666665,
204
+ "grad_norm": 1.809343934059143,
205
+ "kl": 0.004354253404017072,
206
+ "learning_rate": 8.81534288045431e-08,
207
  "loss": 0.0,
208
+ "reward": 0.6040084585547447,
209
+ "reward_std": 0.7027890784665942,
210
+ "rewards/concensus_correctness_reward_func": 0.03125,
211
+ "rewards/consensus_reward_func": 0.0625,
212
  "rewards/cumulative_reward_2": 0.0,
213
  "rewards/final_correctness_reward_func": 0.0,
214
+ "rewards/question_recreation_reward_func": 0.3406959716230631,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  "rewards/soft_format_reward_func": 0.0,
216
  "rewards/strict_format_reward_func": 0.0,
217
+ "rewards/xmlcount_reward_func": 0.16956250369548798,
218
+ "step": 22
219
  },
220
  {
221
+ "completion_length": 341.59375,
222
  "epoch": 4.0,
223
+ "grad_norm": 2.9913904666900635,
224
+ "kl": 0.0018703212990658358,
225
+ "learning_rate": 5.097673357358906e-08,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
  "loss": 0.0,
227
+ "reward": 0.7669433043338358,
228
+ "reward_std": 0.7492063250392675,
229
+ "rewards/concensus_correctness_reward_func": 0.0,
230
  "rewards/consensus_reward_func": 0.0,
231
  "rewards/cumulative_reward_2": 0.0,
232
  "rewards/final_correctness_reward_func": 0.25,
233
+ "rewards/question_recreation_reward_func": 0.2261932883411646,
234
  "rewards/soft_format_reward_func": 0.0,
235
  "rewards/strict_format_reward_func": 0.0,
236
+ "rewards/xmlcount_reward_func": 0.2907500099390745,
237
+ "step": 24
238
  },
239
  {
240
+ "completion_length": 330.8125,
241
+ "epoch": 4.333333333333333,
242
+ "grad_norm": 2.7992258071899414,
243
+ "kl": 0.0020362258510431275,
244
+ "learning_rate": 2.3106145082260774e-08,
245
  "loss": 0.0,
246
+ "reward": 0.8596324669197202,
247
+ "reward_std": 0.7516352757811546,
248
  "rewards/concensus_correctness_reward_func": 0.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
  "rewards/consensus_reward_func": 0.0625,
250
  "rewards/cumulative_reward_2": 0.0,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
  "rewards/final_correctness_reward_func": 0.0625,
252
+ "rewards/question_recreation_reward_func": 0.447476205881685,
253
  "rewards/soft_format_reward_func": 0.0,
254
  "rewards/strict_format_reward_func": 0.0,
255
+ "rewards/xmlcount_reward_func": 0.2871562447398901,
256
+ "step": 26
257
  },
258
  {
259
+ "completion_length": 280.90625,
260
+ "epoch": 4.666666666666667,
261
+ "grad_norm": 4.421070098876953,
262
+ "kl": 0.0036720147472806275,
263
+ "learning_rate": 5.844861072478335e-09,
264
  "loss": 0.0,
265
+ "reward": 0.7156252432614565,
266
+ "reward_std": 0.6206880370154977,
267
+ "rewards/concensus_correctness_reward_func": 0.03125,
268
+ "rewards/consensus_reward_func": 0.125,
269
  "rewards/cumulative_reward_2": 0.0,
270
  "rewards/final_correctness_reward_func": 0.0625,
271
+ "rewards/question_recreation_reward_func": 0.35737523529678583,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
  "rewards/soft_format_reward_func": 0.0,
273
  "rewards/strict_format_reward_func": 0.0,
274
+ "rewards/xmlcount_reward_func": 0.13949999772012234,
275
+ "step": 28
276
  },
277
  {
278
+ "completion_length": 392.6875,
279
+ "epoch": 5.0,
280
+ "grad_norm": 2.573320150375366,
281
+ "kl": 0.002341544590308331,
282
  "learning_rate": 0.0,
283
  "loss": 0.0,
284
+ "reward": 0.289842139929533,
285
+ "reward_std": 0.8682990339584649,
286
  "rewards/concensus_correctness_reward_func": 0.0,
287
  "rewards/consensus_reward_func": 0.0,
288
  "rewards/cumulative_reward_2": 0.0,
289
+ "rewards/final_correctness_reward_func": 0.125,
290
+ "rewards/question_recreation_reward_func": 0.22246714239008725,
291
  "rewards/soft_format_reward_func": 0.0,
292
  "rewards/strict_format_reward_func": 0.0,
293
+ "rewards/xmlcount_reward_func": -0.05762500315904617,
294
+ "step": 30
295
  },
296
  {
297
+ "epoch": 5.0,
298
+ "step": 30,
299
  "total_flos": 0.0,
300
+ "train_loss": 3.2592993572203946e-06,
301
+ "train_runtime": 1944.3016,
302
+ "train_samples_per_second": 0.247,
303
+ "train_steps_per_second": 0.015
304
  }
305
  ],
306
  "logging_steps": 2,
307
+ "max_steps": 30,
308
  "num_input_tokens_seen": 0,
309
+ "num_train_epochs": 5,
310
  "save_steps": 25,
311
  "stateful_callbacks": {
312
  "TrainerControl": {
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:917faed80e8f0d32c7a502f8387188ce0f22101cba52b5ab4837cce72832a98f
3
  size 5944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6738a9deb710effecf6a6117a558fe1d30598f3c1804870a6a1ddc1103b3f35
3
  size 5944