Attila1011 commited on
Commit
90bfee7
·
verified ·
1 Parent(s): dbd9fcf

Upload folder using huggingface_hub

Browse files
checkpoints-v3/checkpoint-7168/eval_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints-v3/checkpoint-7168/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f7daf11bb4eaab952771d897197a4c2cf99e395c51912e1a734b70b8761a043
3
+ size 37402680
checkpoints-v3/checkpoint-7168/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c895e6a2893c3a519125ef0880ccd3b676b305dfca932aedac9bc49b9c54e7d9
3
+ size 512267
checkpoints-v3/checkpoint-7168/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01c01c678fa29a9262064ade7c90c5c12c8ca1b0c16a002081bbaef811e78e00
3
+ size 14645
checkpoints-v3/checkpoint-7168/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d595dc04756955b718dbe40e12e3b42e9a74ec09bbdeec39a22714665de3cd13
3
+ size 1383
checkpoints-v3/checkpoint-7168/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1accfb8c9e4435c988a0a31d91d0a223e4cb602232ad6ce46b39c8faafb8ab9d
3
+ size 1465
checkpoints-v3/checkpoint-7168/trainer_state.json ADDED
@@ -0,0 +1,573 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.07450407704020913,
6
+ "eval_steps": 1024,
7
+ "global_step": 7168,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.002660859894293183,
14
+ "grad_norm": 1.2474960088729858,
15
+ "learning_rate": 4.416349151368202e-06,
16
+ "loss": 10.7959,
17
+ "step": 256
18
+ },
19
+ {
20
+ "epoch": 0.005321719788586366,
21
+ "grad_norm": 0.9909601807594299,
22
+ "learning_rate": 8.85001731901628e-06,
23
+ "loss": 10.0541,
24
+ "step": 512
25
+ },
26
+ {
27
+ "epoch": 0.00798257968287955,
28
+ "grad_norm": 0.8707857728004456,
29
+ "learning_rate": 1.3283685486664357e-05,
30
+ "loss": 9.0128,
31
+ "step": 768
32
+ },
33
+ {
34
+ "epoch": 0.010643439577172733,
35
+ "grad_norm": 0.8646675944328308,
36
+ "learning_rate": 1.7717353654312436e-05,
37
+ "loss": 8.0258,
38
+ "step": 1024
39
+ },
40
+ {
41
+ "epoch": 0.010643439577172733,
42
+ "eval_acr_loss": 0.010622835965477861,
43
+ "eval_across_var": 0.012448028806829825,
44
+ "eval_bleu": 0.09796718659972453,
45
+ "eval_ce_loss": 7.011146575212479,
46
+ "eval_cos_loss": 0.9561333861202002,
47
+ "eval_cov": 0.06968498229980469,
48
+ "eval_cov_loss": 0.007897357034380548,
49
+ "eval_glb_loss": 0.4064090773463249,
50
+ "eval_global_kurtosis": 3.1536246612668037,
51
+ "eval_global_mean": -0.0043766796588897705,
52
+ "eval_global_var": 0.26236724853515625,
53
+ "eval_krt_loss": 0.02414830235647969,
54
+ "eval_loss": 7.440777659416199,
55
+ "eval_mean_loss": 2.432924324580199e-05,
56
+ "eval_mse_loss": 1.9190465211868286,
57
+ "eval_per_loss": 0.3546135723590851,
58
+ "eval_per_var": 0.25460052490234375,
59
+ "eval_within_var": 0.2497538859024644,
60
+ "eval_wth_loss": 0.42283743154257536,
61
+ "step": 1024
62
+ },
63
+ {
64
+ "epoch": 0.010643439577172733,
65
+ "eval_acr_loss": 0.010622835965477861,
66
+ "eval_across_var": 0.012448028806829825,
67
+ "eval_bleu": 0.09796718659972453,
68
+ "eval_ce_loss": 7.011146575212479,
69
+ "eval_cos_loss": 0.9561333861202002,
70
+ "eval_cov": 0.06968498229980469,
71
+ "eval_cov_loss": 0.007897357034380548,
72
+ "eval_glb_loss": 0.4064090773463249,
73
+ "eval_global_kurtosis": 3.1536246612668037,
74
+ "eval_global_mean": -0.0043766796588897705,
75
+ "eval_global_var": 0.26236724853515625,
76
+ "eval_krt_loss": 0.02414830235647969,
77
+ "eval_loss": 7.440777659416199,
78
+ "eval_mean_loss": 2.432924324580199e-05,
79
+ "eval_mse_loss": 1.9190465211868286,
80
+ "eval_per_loss": 0.3546135723590851,
81
+ "eval_per_var": 0.25460052490234375,
82
+ "eval_runtime": 10.2924,
83
+ "eval_samples_per_second": 194.318,
84
+ "eval_steps_per_second": 3.109,
85
+ "eval_within_var": 0.2497538859024644,
86
+ "eval_wth_loss": 0.42283743154257536,
87
+ "step": 1024
88
+ },
89
+ {
90
+ "epoch": 0.013304299471465915,
91
+ "grad_norm": 0.8640124797821045,
92
+ "learning_rate": 2.2151021821960514e-05,
93
+ "loss": 7.0717,
94
+ "step": 1280
95
+ },
96
+ {
97
+ "epoch": 0.0159651593657591,
98
+ "grad_norm": 0.7730758190155029,
99
+ "learning_rate": 2.6584689989608592e-05,
100
+ "loss": 6.1186,
101
+ "step": 1536
102
+ },
103
+ {
104
+ "epoch": 0.018626019260052282,
105
+ "grad_norm": 0.6963288187980652,
106
+ "learning_rate": 3.1018358157256674e-05,
107
+ "loss": 5.2034,
108
+ "step": 1792
109
+ },
110
+ {
111
+ "epoch": 0.021286879154345465,
112
+ "grad_norm": 0.5616676211357117,
113
+ "learning_rate": 3.5452026324904745e-05,
114
+ "loss": 4.3736,
115
+ "step": 2048
116
+ },
117
+ {
118
+ "epoch": 0.021286879154345465,
119
+ "eval_acr_loss": 0.012247161677805707,
120
+ "eval_across_var": 0.012088194896932691,
121
+ "eval_bleu": 0.3414767015551533,
122
+ "eval_ce_loss": 3.459592819213867,
123
+ "eval_cos_loss": 0.9247305598109961,
124
+ "eval_cov": 0.0738067626953125,
125
+ "eval_cov_loss": 0.008805307501461357,
126
+ "eval_glb_loss": 0.354451559484005,
127
+ "eval_global_kurtosis": 3.0689163729548454,
128
+ "eval_global_mean": -0.004139065742492676,
129
+ "eval_global_var": 0.30448150634765625,
130
+ "eval_krt_loss": 0.005096593113194103,
131
+ "eval_loss": 3.872511200606823,
132
+ "eval_mean_loss": 2.0987964205687604e-05,
133
+ "eval_mse_loss": 1.9079551436007023,
134
+ "eval_per_loss": 0.3076172471046448,
135
+ "eval_per_var": 0.29547882080078125,
136
+ "eval_within_var": 0.29124921560287476,
137
+ "eval_wth_loss": 0.3706007469445467,
138
+ "step": 2048
139
+ },
140
+ {
141
+ "epoch": 0.021286879154345465,
142
+ "eval_acr_loss": 0.012247161677805707,
143
+ "eval_across_var": 0.012088194896932691,
144
+ "eval_bleu": 0.3414767015551533,
145
+ "eval_ce_loss": 3.459592819213867,
146
+ "eval_cos_loss": 0.9247305598109961,
147
+ "eval_cov": 0.0738067626953125,
148
+ "eval_cov_loss": 0.008805307501461357,
149
+ "eval_glb_loss": 0.354451559484005,
150
+ "eval_global_kurtosis": 3.0689163729548454,
151
+ "eval_global_mean": -0.004139065742492676,
152
+ "eval_global_var": 0.30448150634765625,
153
+ "eval_krt_loss": 0.005096593113194103,
154
+ "eval_loss": 3.872511200606823,
155
+ "eval_mean_loss": 2.0987964205687604e-05,
156
+ "eval_mse_loss": 1.9079551436007023,
157
+ "eval_per_loss": 0.3076172471046448,
158
+ "eval_per_var": 0.29547882080078125,
159
+ "eval_runtime": 9.9029,
160
+ "eval_samples_per_second": 201.962,
161
+ "eval_steps_per_second": 3.231,
162
+ "eval_within_var": 0.29124921560287476,
163
+ "eval_wth_loss": 0.3706007469445467,
164
+ "step": 2048
165
+ },
166
+ {
167
+ "epoch": 0.023947739048638648,
168
+ "grad_norm": 0.44369781017303467,
169
+ "learning_rate": 3.988569449255283e-05,
170
+ "loss": 3.6681,
171
+ "step": 2304
172
+ },
173
+ {
174
+ "epoch": 0.02660859894293183,
175
+ "grad_norm": 0.3687000274658203,
176
+ "learning_rate": 4.43193626602009e-05,
177
+ "loss": 3.0886,
178
+ "step": 2560
179
+ },
180
+ {
181
+ "epoch": 0.029269458837225013,
182
+ "grad_norm": 0.3568866550922394,
183
+ "learning_rate": 4.875303082784898e-05,
184
+ "loss": 2.6049,
185
+ "step": 2816
186
+ },
187
+ {
188
+ "epoch": 0.0319303187315182,
189
+ "grad_norm": 0.2986361086368561,
190
+ "learning_rate": 4.9999520413849384e-05,
191
+ "loss": 2.2063,
192
+ "step": 3072
193
+ },
194
+ {
195
+ "epoch": 0.0319303187315182,
196
+ "eval_acr_loss": 0.011944463331019506,
197
+ "eval_across_var": 0.025048962590517476,
198
+ "eval_bleu": 0.5783938497071468,
199
+ "eval_ce_loss": 1.5567151941359043,
200
+ "eval_cos_loss": 0.8373732026666403,
201
+ "eval_cov": 0.10790634155273438,
202
+ "eval_cov_loss": 0.01786720016389154,
203
+ "eval_glb_loss": 0.08421005308628082,
204
+ "eval_global_kurtosis": 3.042153775691986,
205
+ "eval_global_mean": -0.0013459473848342896,
206
+ "eval_global_var": 0.60980224609375,
207
+ "eval_krt_loss": 0.0020023469523948734,
208
+ "eval_loss": 1.9125033244490623,
209
+ "eval_mean_loss": 9.189085614202952e-06,
210
+ "eval_mse_loss": 1.798950683325529,
211
+ "eval_per_loss": 0.06572123290970922,
212
+ "eval_per_var": 0.59381103515625,
213
+ "eval_within_var": 0.5743193719536066,
214
+ "eval_wth_loss": 0.10620259935967624,
215
+ "step": 3072
216
+ },
217
+ {
218
+ "epoch": 0.0319303187315182,
219
+ "eval_acr_loss": 0.011944463331019506,
220
+ "eval_across_var": 0.025048962590517476,
221
+ "eval_bleu": 0.5783938497071468,
222
+ "eval_ce_loss": 1.5567151941359043,
223
+ "eval_cos_loss": 0.8373732026666403,
224
+ "eval_cov": 0.10790634155273438,
225
+ "eval_cov_loss": 0.01786720016389154,
226
+ "eval_glb_loss": 0.08421005308628082,
227
+ "eval_global_kurtosis": 3.042153775691986,
228
+ "eval_global_mean": -0.0013459473848342896,
229
+ "eval_global_var": 0.60980224609375,
230
+ "eval_krt_loss": 0.0020023469523948734,
231
+ "eval_loss": 1.9125033244490623,
232
+ "eval_mean_loss": 9.189085614202952e-06,
233
+ "eval_mse_loss": 1.798950683325529,
234
+ "eval_per_loss": 0.06572123290970922,
235
+ "eval_per_var": 0.59381103515625,
236
+ "eval_runtime": 10.4973,
237
+ "eval_samples_per_second": 190.525,
238
+ "eval_steps_per_second": 3.048,
239
+ "eval_within_var": 0.5743193719536066,
240
+ "eval_wth_loss": 0.10620259935967624,
241
+ "step": 3072
242
+ },
243
+ {
244
+ "epoch": 0.03459117862581138,
245
+ "grad_norm": 0.270656943321228,
246
+ "learning_rate": 4.9997257606389056e-05,
247
+ "loss": 1.8881,
248
+ "step": 3328
249
+ },
250
+ {
251
+ "epoch": 0.037252038520104565,
252
+ "grad_norm": 0.24188651144504547,
253
+ "learning_rate": 4.999313831167736e-05,
254
+ "loss": 1.6388,
255
+ "step": 3584
256
+ },
257
+ {
258
+ "epoch": 0.03991289841439775,
259
+ "grad_norm": 0.2294900268316269,
260
+ "learning_rate": 4.998716283564454e-05,
261
+ "loss": 1.4382,
262
+ "step": 3840
263
+ },
264
+ {
265
+ "epoch": 0.04257375830869093,
266
+ "grad_norm": 0.20773501694202423,
267
+ "learning_rate": 4.99793316220751e-05,
268
+ "loss": 1.2713,
269
+ "step": 4096
270
+ },
271
+ {
272
+ "epoch": 0.04257375830869093,
273
+ "eval_acr_loss": 0.011318061951897107,
274
+ "eval_across_var": 0.037467821151949465,
275
+ "eval_bleu": 0.7398917090811331,
276
+ "eval_ce_loss": 0.8147697541862726,
277
+ "eval_cos_loss": 0.739902313798666,
278
+ "eval_cov": 0.09920120239257812,
279
+ "eval_cov_loss": 0.015144521807087585,
280
+ "eval_glb_loss": 0.0026292089896742254,
281
+ "eval_global_kurtosis": 3.0431209057569504,
282
+ "eval_global_mean": 0.0002828165888786316,
283
+ "eval_global_var": 0.849639892578125,
284
+ "eval_krt_loss": 0.002429395680081825,
285
+ "eval_loss": 1.1247494276612997,
286
+ "eval_mean_loss": 1.0841616662204956e-05,
287
+ "eval_mse_loss": 1.6548683494329453,
288
+ "eval_per_loss": 0.0006620931362704141,
289
+ "eval_per_var": 0.826202392578125,
290
+ "eval_within_var": 0.8049256391823292,
291
+ "eval_wth_loss": 0.009282346058171242,
292
+ "step": 4096
293
+ },
294
+ {
295
+ "epoch": 0.04257375830869093,
296
+ "eval_acr_loss": 0.011318061951897107,
297
+ "eval_across_var": 0.037467821151949465,
298
+ "eval_bleu": 0.7398917090811331,
299
+ "eval_ce_loss": 0.8147697541862726,
300
+ "eval_cos_loss": 0.739902313798666,
301
+ "eval_cov": 0.09920120239257812,
302
+ "eval_cov_loss": 0.015144521807087585,
303
+ "eval_glb_loss": 0.0026292089896742254,
304
+ "eval_global_kurtosis": 3.0431209057569504,
305
+ "eval_global_mean": 0.0002828165888786316,
306
+ "eval_global_var": 0.849639892578125,
307
+ "eval_krt_loss": 0.002429395680081825,
308
+ "eval_loss": 1.1247494276612997,
309
+ "eval_mean_loss": 1.0841616662204956e-05,
310
+ "eval_mse_loss": 1.6548683494329453,
311
+ "eval_per_loss": 0.0006620931362704141,
312
+ "eval_per_var": 0.826202392578125,
313
+ "eval_runtime": 10.0939,
314
+ "eval_samples_per_second": 198.14,
315
+ "eval_steps_per_second": 3.17,
316
+ "eval_within_var": 0.8049256391823292,
317
+ "eval_wth_loss": 0.009282346058171242,
318
+ "step": 4096
319
+ },
320
+ {
321
+ "epoch": 0.04523461820298411,
322
+ "grad_norm": 0.1941945105791092,
323
+ "learning_rate": 4.996964525257477e-05,
324
+ "loss": 1.1364,
325
+ "step": 4352
326
+ },
327
+ {
328
+ "epoch": 0.047895478097277296,
329
+ "grad_norm": 0.17706365883350372,
330
+ "learning_rate": 4.995810444652731e-05,
331
+ "loss": 1.0202,
332
+ "step": 4608
333
+ },
334
+ {
335
+ "epoch": 0.05055633799157048,
336
+ "grad_norm": 0.17764592170715332,
337
+ "learning_rate": 4.994471006104112e-05,
338
+ "loss": 0.9256,
339
+ "step": 4864
340
+ },
341
+ {
342
+ "epoch": 0.05321719788586366,
343
+ "grad_norm": 0.1597519963979721,
344
+ "learning_rate": 4.992946309088557e-05,
345
+ "loss": 0.8433,
346
+ "step": 5120
347
+ },
348
+ {
349
+ "epoch": 0.05321719788586366,
350
+ "eval_acr_loss": 0.010796478512929752,
351
+ "eval_across_var": 0.0437286015949212,
352
+ "eval_bleu": 0.8359253777154618,
353
+ "eval_ce_loss": 0.4896330190822482,
354
+ "eval_cos_loss": 0.6560099385678768,
355
+ "eval_cov": 0.08585166931152344,
356
+ "eval_cov_loss": 0.011597162316320464,
357
+ "eval_glb_loss": 0.0,
358
+ "eval_global_kurtosis": 3.051339641213417,
359
+ "eval_global_mean": 0.0004043206572532654,
360
+ "eval_global_var": 0.9364166259765625,
361
+ "eval_krt_loss": 0.0034133701161636054,
362
+ "eval_loss": 0.7659010197967291,
363
+ "eval_mean_loss": 1.1450480416286268e-05,
364
+ "eval_mse_loss": 1.5238465368747711,
365
+ "eval_per_loss": 0.0,
366
+ "eval_per_var": 0.9102935791015625,
367
+ "eval_within_var": 0.8955719340592623,
368
+ "eval_wth_loss": 0.00021194279955238926,
369
+ "step": 5120
370
+ },
371
+ {
372
+ "epoch": 0.05321719788586366,
373
+ "eval_acr_loss": 0.010796478512929752,
374
+ "eval_across_var": 0.0437286015949212,
375
+ "eval_bleu": 0.8359253777154618,
376
+ "eval_ce_loss": 0.4896330190822482,
377
+ "eval_cos_loss": 0.6560099385678768,
378
+ "eval_cov": 0.08585166931152344,
379
+ "eval_cov_loss": 0.011597162316320464,
380
+ "eval_glb_loss": 0.0,
381
+ "eval_global_kurtosis": 3.051339641213417,
382
+ "eval_global_mean": 0.0004043206572532654,
383
+ "eval_global_var": 0.9364166259765625,
384
+ "eval_krt_loss": 0.0034133701161636054,
385
+ "eval_loss": 0.7659010197967291,
386
+ "eval_mean_loss": 1.1450480416286268e-05,
387
+ "eval_mse_loss": 1.5238465368747711,
388
+ "eval_per_loss": 0.0,
389
+ "eval_per_var": 0.9102935791015625,
390
+ "eval_runtime": 10.0333,
391
+ "eval_samples_per_second": 199.336,
392
+ "eval_steps_per_second": 3.189,
393
+ "eval_within_var": 0.8955719340592623,
394
+ "eval_wth_loss": 0.00021194279955238926,
395
+ "step": 5120
396
+ },
397
+ {
398
+ "epoch": 0.055878057780156844,
399
+ "grad_norm": 0.15128253400325775,
400
+ "learning_rate": 4.991236466841708e-05,
401
+ "loss": 0.7748,
402
+ "step": 5376
403
+ },
404
+ {
405
+ "epoch": 0.058538917674450026,
406
+ "grad_norm": 0.15075387060642242,
407
+ "learning_rate": 4.989341606349509e-05,
408
+ "loss": 0.7149,
409
+ "step": 5632
410
+ },
411
+ {
412
+ "epoch": 0.06119977756874321,
413
+ "grad_norm": 0.13722559809684753,
414
+ "learning_rate": 4.987261868338772e-05,
415
+ "loss": 0.6633,
416
+ "step": 5888
417
+ },
418
+ {
419
+ "epoch": 0.0638606374630364,
420
+ "grad_norm": 0.14299507439136505,
421
+ "learning_rate": 4.9849974072667235e-05,
422
+ "loss": 0.6168,
423
+ "step": 6144
424
+ },
425
+ {
426
+ "epoch": 0.0638606374630364,
427
+ "eval_acr_loss": 0.010568196172243915,
428
+ "eval_across_var": 0.050391704426147044,
429
+ "eval_bleu": 0.8864417334039504,
430
+ "eval_ce_loss": 0.3192982799373567,
431
+ "eval_cos_loss": 0.5848112031817436,
432
+ "eval_cov": 0.08610343933105469,
433
+ "eval_cov_loss": 0.011645367194432765,
434
+ "eval_glb_loss": 0.0,
435
+ "eval_global_kurtosis": 3.057781808078289,
436
+ "eval_global_mean": 0.00010520219802856445,
437
+ "eval_global_var": 1.05322265625,
438
+ "eval_krt_loss": 0.00413643000592856,
439
+ "eval_loss": 0.5672316299751401,
440
+ "eval_mean_loss": 1.1898590268621945e-05,
441
+ "eval_mse_loss": 1.4093649201095104,
442
+ "eval_per_loss": 0.0,
443
+ "eval_per_var": 1.0248565673828125,
444
+ "eval_within_var": 1.0096650514751673,
445
+ "eval_wth_loss": 0.0,
446
+ "step": 6144
447
+ },
448
+ {
449
+ "epoch": 0.0638606374630364,
450
+ "eval_acr_loss": 0.010568196172243915,
451
+ "eval_across_var": 0.050391704426147044,
452
+ "eval_bleu": 0.8864417334039504,
453
+ "eval_ce_loss": 0.3192982799373567,
454
+ "eval_cos_loss": 0.5848112031817436,
455
+ "eval_cov": 0.08610343933105469,
456
+ "eval_cov_loss": 0.011645367194432765,
457
+ "eval_glb_loss": 0.0,
458
+ "eval_global_kurtosis": 3.057781808078289,
459
+ "eval_global_mean": 0.00010520219802856445,
460
+ "eval_global_var": 1.05322265625,
461
+ "eval_krt_loss": 0.00413643000592856,
462
+ "eval_loss": 0.5672316299751401,
463
+ "eval_mean_loss": 1.1898590268621945e-05,
464
+ "eval_mse_loss": 1.4093649201095104,
465
+ "eval_per_loss": 0.0,
466
+ "eval_per_var": 1.0248565673828125,
467
+ "eval_runtime": 10.495,
468
+ "eval_samples_per_second": 190.567,
469
+ "eval_steps_per_second": 3.049,
470
+ "eval_within_var": 1.0096650514751673,
471
+ "eval_wth_loss": 0.0,
472
+ "step": 6144
473
+ },
474
+ {
475
+ "epoch": 0.06652149735732958,
476
+ "grad_norm": 0.13175231218338013,
477
+ "learning_rate": 4.9825483913095364e-05,
478
+ "loss": 0.5727,
479
+ "step": 6400
480
+ },
481
+ {
482
+ "epoch": 0.06918235725162276,
483
+ "grad_norm": 0.130602166056633,
484
+ "learning_rate": 4.979915002349838e-05,
485
+ "loss": 0.5411,
486
+ "step": 6656
487
+ },
488
+ {
489
+ "epoch": 0.07184321714591595,
490
+ "grad_norm": 0.12843571603298187,
491
+ "learning_rate": 4.977097435963204e-05,
492
+ "loss": 0.5082,
493
+ "step": 6912
494
+ },
495
+ {
496
+ "epoch": 0.07450407704020913,
497
+ "grad_norm": 0.1221570074558258,
498
+ "learning_rate": 4.974095901403632e-05,
499
+ "loss": 0.4775,
500
+ "step": 7168
501
+ },
502
+ {
503
+ "epoch": 0.07450407704020913,
504
+ "eval_acr_loss": 0.01032613120332826,
505
+ "eval_across_var": 0.055548187578096986,
506
+ "eval_bleu": 0.917825685067053,
507
+ "eval_ce_loss": 0.22222592495381832,
508
+ "eval_cos_loss": 0.5258319452404976,
509
+ "eval_cov": 0.0838165283203125,
510
+ "eval_cov_loss": 0.01109178303158842,
511
+ "eval_glb_loss": 0.0011626811420910599,
512
+ "eval_global_kurtosis": 3.0560965314507484,
513
+ "eval_global_mean": -0.00032412633299827576,
514
+ "eval_global_var": 1.1317138671875,
515
+ "eval_krt_loss": 0.003975647037577801,
516
+ "eval_loss": 0.4466686090454459,
517
+ "eval_mean_loss": 1.2274138283974168e-05,
518
+ "eval_mse_loss": 1.3142655715346336,
519
+ "eval_per_loss": 0.0,
520
+ "eval_per_var": 1.1024169921875,
521
+ "eval_within_var": 1.088273286819458,
522
+ "eval_wth_loss": 4.5452433568016204e-05,
523
+ "step": 7168
524
+ },
525
+ {
526
+ "epoch": 0.07450407704020913,
527
+ "eval_acr_loss": 0.01032613120332826,
528
+ "eval_across_var": 0.055548187578096986,
529
+ "eval_bleu": 0.917825685067053,
530
+ "eval_ce_loss": 0.22222592495381832,
531
+ "eval_cos_loss": 0.5258319452404976,
532
+ "eval_cov": 0.0838165283203125,
533
+ "eval_cov_loss": 0.01109178303158842,
534
+ "eval_glb_loss": 0.0011626811420910599,
535
+ "eval_global_kurtosis": 3.0560965314507484,
536
+ "eval_global_mean": -0.00032412633299827576,
537
+ "eval_global_var": 1.1317138671875,
538
+ "eval_krt_loss": 0.003975647037577801,
539
+ "eval_loss": 0.4466686090454459,
540
+ "eval_mean_loss": 1.2274138283974168e-05,
541
+ "eval_mse_loss": 1.3142655715346336,
542
+ "eval_per_loss": 0.0,
543
+ "eval_per_var": 1.1024169921875,
544
+ "eval_runtime": 10.2975,
545
+ "eval_samples_per_second": 194.222,
546
+ "eval_steps_per_second": 3.108,
547
+ "eval_within_var": 1.088273286819458,
548
+ "eval_wth_loss": 4.5452433568016204e-05,
549
+ "step": 7168
550
+ }
551
+ ],
552
+ "logging_steps": 256,
553
+ "max_steps": 96210,
554
+ "num_input_tokens_seen": 0,
555
+ "num_train_epochs": 1,
556
+ "save_steps": 1024,
557
+ "stateful_callbacks": {
558
+ "TrainerControl": {
559
+ "args": {
560
+ "should_epoch_stop": false,
561
+ "should_evaluate": false,
562
+ "should_log": false,
563
+ "should_save": true,
564
+ "should_training_stop": false
565
+ },
566
+ "attributes": {}
567
+ }
568
+ },
569
+ "total_flos": 0.0,
570
+ "train_batch_size": 64,
571
+ "trial_name": null,
572
+ "trial_params": null
573
+ }
checkpoints-v3/checkpoint-7168/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d4bda444d2217dfb8c9657e282f4547df7c92efeb7c6abde9e602028d4dde0a
3
+ size 5777