ChiefTheLord commited on
Commit
dbd9fcf
·
verified ·
1 Parent(s): d6ef836

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -39,3 +39,4 @@ checkpoints-v2.1/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs
39
  checkpoints-v2.2/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
40
  checkpoints-v2.3/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
41
  checkpoints-v2.5/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
 
 
39
  checkpoints-v2.2/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
40
  checkpoints-v2.3/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
41
  checkpoints-v2.5/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
42
+ checkpoints-v2.6/checkpoint-12288/eval_state.json filter=lfs diff=lfs merge=lfs -text
checkpoints-v2.6/checkpoint-12288/eval_state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64adffb790cb59a22cb9881c4844dc3bbd3e7155ebfe1e01f8f19faa296ec6f6
3
+ size 44107863
checkpoints-v2.6/checkpoint-12288/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1cc0c28b61de6dac74c60e45ad7135a0bbe5e446655a19c7fd4b36261cac23b4
3
+ size 37402680
checkpoints-v2.6/checkpoint-12288/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ab51f2a176e646a1010b5f063fb1a1fb5588eeefd32cd561694cb23aad84ba8
3
+ size 512267
checkpoints-v2.6/checkpoint-12288/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f36e6c28b77555b6da6de84681647b558ac8ebc553a1b458e45112e416a213c
3
+ size 14645
checkpoints-v2.6/checkpoint-12288/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6fa220a1da697fc6e732f23a692e1fe49da32dad48c61771249ee01a41c4206
3
+ size 1383
checkpoints-v2.6/checkpoint-12288/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:87b2b9d7efb9af569cd6999393703c76b185862a0221b931604de9e5bf2b79a6
3
+ size 1465
checkpoints-v2.6/checkpoint-12288/trainer_state.json ADDED
@@ -0,0 +1,958 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.5675488430095608,
6
+ "eval_steps": 1024,
7
+ "global_step": 12288,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.011823934229365849,
14
+ "grad_norm": 1.1330132484436035,
15
+ "learning_rate": 1.9615384615384617e-05,
16
+ "loss": 10.3987,
17
+ "step": 256
18
+ },
19
+ {
20
+ "epoch": 0.023647868458731697,
21
+ "grad_norm": 1.1089212894439697,
22
+ "learning_rate": 3.930769230769231e-05,
23
+ "loss": 7.8889,
24
+ "step": 512
25
+ },
26
+ {
27
+ "epoch": 0.03547180268809755,
28
+ "grad_norm": 0.8628233075141907,
29
+ "learning_rate": 4.999617095521894e-05,
30
+ "loss": 5.5834,
31
+ "step": 768
32
+ },
33
+ {
34
+ "epoch": 0.047295736917463395,
35
+ "grad_norm": 0.5602379441261292,
36
+ "learning_rate": 4.9961092368776736e-05,
37
+ "loss": 3.7587,
38
+ "step": 1024
39
+ },
40
+ {
41
+ "epoch": 0.047295736917463395,
42
+ "eval_acr_loss": 0.472869769419165,
43
+ "eval_across_var": 0.004283991528007299,
44
+ "eval_bleu": 0.5621407857406262,
45
+ "eval_ce_loss": 2.3296360490528962,
46
+ "eval_cos_loss": 0.9237595144744333,
47
+ "eval_cov": 0.07785117789490582,
48
+ "eval_cov_loss": 0.009674696304907674,
49
+ "eval_glb_loss": 0.3056357570979149,
50
+ "eval_global_kurtosis": 3.016345693640513,
51
+ "eval_global_mean": -0.0019390732730360336,
52
+ "eval_global_var": 0.3470509150256849,
53
+ "eval_krt_loss": 0.00041134808841301906,
54
+ "eval_loss": 2.7735937942652944,
55
+ "eval_mean_loss": 5.7365890913968915e-06,
56
+ "eval_mse_loss": 1.926733713443965,
57
+ "eval_per_loss": 0.3164680907715401,
58
+ "eval_per_var": 0.337350505672089,
59
+ "eval_within_var": 0.342682637784579,
60
+ "eval_wth_loss": 0.310607642547725,
61
+ "step": 1024
62
+ },
63
+ {
64
+ "epoch": 0.047295736917463395,
65
+ "eval_acr_loss": 0.472869769419165,
66
+ "eval_across_var": 0.004283991528007299,
67
+ "eval_bleu": 0.5621407857406262,
68
+ "eval_ce_loss": 2.3296360490528962,
69
+ "eval_cos_loss": 0.9237595144744333,
70
+ "eval_cov": 0.07785117789490582,
71
+ "eval_cov_loss": 0.009674696304907674,
72
+ "eval_glb_loss": 0.3056357570979149,
73
+ "eval_global_kurtosis": 3.016345693640513,
74
+ "eval_global_mean": -0.0019390732730360336,
75
+ "eval_global_var": 0.3470509150256849,
76
+ "eval_krt_loss": 0.00041134808841301906,
77
+ "eval_loss": 2.7735937942652944,
78
+ "eval_mean_loss": 5.7365890913968915e-06,
79
+ "eval_mse_loss": 1.926733713443965,
80
+ "eval_per_loss": 0.3164680907715401,
81
+ "eval_per_var": 0.337350505672089,
82
+ "eval_runtime": 158.0332,
83
+ "eval_samples_per_second": 177.134,
84
+ "eval_steps_per_second": 2.772,
85
+ "eval_within_var": 0.342682637784579,
86
+ "eval_wth_loss": 0.310607642547725,
87
+ "step": 1024
88
+ },
89
+ {
90
+ "epoch": 0.05911967114682925,
91
+ "grad_norm": 0.3792508840560913,
92
+ "learning_rate": 4.988941132556799e-05,
93
+ "loss": 2.6954,
94
+ "step": 1280
95
+ },
96
+ {
97
+ "epoch": 0.0709436053761951,
98
+ "grad_norm": 0.30784618854522705,
99
+ "learning_rate": 4.9781232937269974e-05,
100
+ "loss": 2.0719,
101
+ "step": 1536
102
+ },
103
+ {
104
+ "epoch": 0.08276753960556095,
105
+ "grad_norm": 0.24569711089134216,
106
+ "learning_rate": 4.963671583455164e-05,
107
+ "loss": 1.664,
108
+ "step": 1792
109
+ },
110
+ {
111
+ "epoch": 0.09459147383492679,
112
+ "grad_norm": 0.20954662561416626,
113
+ "learning_rate": 4.945607193446079e-05,
114
+ "loss": 1.385,
115
+ "step": 2048
116
+ },
117
+ {
118
+ "epoch": 0.09459147383492679,
119
+ "eval_acr_loss": 0.467126801201742,
120
+ "eval_across_var": 0.014505732470706582,
121
+ "eval_bleu": 0.8102335908319817,
122
+ "eval_ce_loss": 0.6907969389331939,
123
+ "eval_cos_loss": 0.7647272306217995,
124
+ "eval_cov": 0.10773917420269692,
125
+ "eval_cov_loss": 0.017931701802567805,
126
+ "eval_glb_loss": 0.000625664478229841,
127
+ "eval_global_kurtosis": 3.000208995113634,
128
+ "eval_global_mean": -0.007508264829034675,
129
+ "eval_global_var": 0.8768238085045662,
130
+ "eval_krt_loss": 7.712448770311225e-05,
131
+ "eval_loss": 1.0269616710268743,
132
+ "eval_mean_loss": 6.29432441826079e-05,
133
+ "eval_mse_loss": 1.6563664652985526,
134
+ "eval_per_loss": 0.0019870867933301875,
135
+ "eval_per_var": 0.8565188891267124,
136
+ "eval_within_var": 0.8617800931407981,
137
+ "eval_wth_loss": 0.0015375203581266992,
138
+ "step": 2048
139
+ },
140
+ {
141
+ "epoch": 0.09459147383492679,
142
+ "eval_acr_loss": 0.467126801201742,
143
+ "eval_across_var": 0.014505732470706582,
144
+ "eval_bleu": 0.8102335908319817,
145
+ "eval_ce_loss": 0.6907969389331939,
146
+ "eval_cos_loss": 0.7647272306217995,
147
+ "eval_cov": 0.10773917420269692,
148
+ "eval_cov_loss": 0.017931701802567805,
149
+ "eval_glb_loss": 0.000625664478229841,
150
+ "eval_global_kurtosis": 3.000208995113634,
151
+ "eval_global_mean": -0.007508264829034675,
152
+ "eval_global_var": 0.8768238085045662,
153
+ "eval_krt_loss": 7.712448770311225e-05,
154
+ "eval_loss": 1.0269616710268743,
155
+ "eval_mean_loss": 6.29432441826079e-05,
156
+ "eval_mse_loss": 1.6563664652985526,
157
+ "eval_per_loss": 0.0019870867933301875,
158
+ "eval_per_var": 0.8565188891267124,
159
+ "eval_runtime": 153.9643,
160
+ "eval_samples_per_second": 181.815,
161
+ "eval_steps_per_second": 2.845,
162
+ "eval_within_var": 0.8617800931407981,
163
+ "eval_wth_loss": 0.0015375203581266992,
164
+ "step": 2048
165
+ },
166
+ {
167
+ "epoch": 0.10641540806429264,
168
+ "grad_norm": 0.181256964802742,
169
+ "learning_rate": 4.923956612967301e-05,
170
+ "loss": 1.1836,
171
+ "step": 2304
172
+ },
173
+ {
174
+ "epoch": 0.1182393422936585,
175
+ "grad_norm": 0.1728077083826065,
176
+ "learning_rate": 4.898751590005826e-05,
177
+ "loss": 1.024,
178
+ "step": 2560
179
+ },
180
+ {
181
+ "epoch": 0.13006327652302435,
182
+ "grad_norm": 0.14891040325164795,
183
+ "learning_rate": 4.870029084713462e-05,
184
+ "loss": 0.9004,
185
+ "step": 2816
186
+ },
187
+ {
188
+ "epoch": 0.1418872107523902,
189
+ "grad_norm": 0.12918385863304138,
190
+ "learning_rate": 4.837831215209188e-05,
191
+ "loss": 0.8021,
192
+ "step": 3072
193
+ },
194
+ {
195
+ "epoch": 0.1418872107523902,
196
+ "eval_acr_loss": 0.46483140122400574,
197
+ "eval_across_var": 0.01694616893323385,
198
+ "eval_bleu": 0.8960603905933928,
199
+ "eval_ce_loss": 0.32335185586181403,
200
+ "eval_cos_loss": 0.6106928648197487,
201
+ "eval_cov": 0.09281384890482305,
202
+ "eval_cov_loss": 0.013490723856388841,
203
+ "eval_glb_loss": 0.0,
204
+ "eval_global_kurtosis": 3.000047793127086,
205
+ "eval_global_mean": -0.0062043639622866836,
206
+ "eval_global_var": 0.9301077786101598,
207
+ "eval_krt_loss": 0.0001482390632322396,
208
+ "eval_loss": 0.5985009211655621,
209
+ "eval_mean_loss": 4.6196501266608506e-05,
210
+ "eval_mse_loss": 1.3684135288952692,
211
+ "eval_per_loss": 6.070195510882551e-06,
212
+ "eval_per_var": 0.906668049015411,
213
+ "eval_within_var": 0.9128873076068756,
214
+ "eval_wth_loss": 7.710680918438013e-07,
215
+ "step": 3072
216
+ },
217
+ {
218
+ "epoch": 0.1418872107523902,
219
+ "eval_acr_loss": 0.46483140122400574,
220
+ "eval_across_var": 0.01694616893323385,
221
+ "eval_bleu": 0.8960603905933928,
222
+ "eval_ce_loss": 0.32335185586181403,
223
+ "eval_cos_loss": 0.6106928648197487,
224
+ "eval_cov": 0.09281384890482305,
225
+ "eval_cov_loss": 0.013490723856388841,
226
+ "eval_glb_loss": 0.0,
227
+ "eval_global_kurtosis": 3.000047793127086,
228
+ "eval_global_mean": -0.0062043639622866836,
229
+ "eval_global_var": 0.9301077786101598,
230
+ "eval_krt_loss": 0.0001482390632322396,
231
+ "eval_loss": 0.5985009211655621,
232
+ "eval_mean_loss": 4.6196501266608506e-05,
233
+ "eval_mse_loss": 1.3684135288952692,
234
+ "eval_per_loss": 6.070195510882551e-06,
235
+ "eval_per_var": 0.906668049015411,
236
+ "eval_runtime": 154.7557,
237
+ "eval_samples_per_second": 180.885,
238
+ "eval_steps_per_second": 2.83,
239
+ "eval_within_var": 0.9128873076068756,
240
+ "eval_wth_loss": 7.710680918438013e-07,
241
+ "step": 3072
242
+ },
243
+ {
244
+ "epoch": 0.15371114498175603,
245
+ "grad_norm": 0.12574972212314606,
246
+ "learning_rate": 4.802205195817963e-05,
247
+ "loss": 0.7236,
248
+ "step": 3328
249
+ },
250
+ {
251
+ "epoch": 0.1655350792111219,
252
+ "grad_norm": 0.12267891317605972,
253
+ "learning_rate": 4.763203267836576e-05,
254
+ "loss": 0.656,
255
+ "step": 3584
256
+ },
257
+ {
258
+ "epoch": 0.17735901344048774,
259
+ "grad_norm": 0.10911328345537186,
260
+ "learning_rate": 4.720882622928019e-05,
261
+ "loss": 0.6002,
262
+ "step": 3840
263
+ },
264
+ {
265
+ "epoch": 0.18918294766985358,
266
+ "grad_norm": 0.10669343918561935,
267
+ "learning_rate": 4.675305319256765e-05,
268
+ "loss": 0.5541,
269
+ "step": 4096
270
+ },
271
+ {
272
+ "epoch": 0.18918294766985358,
273
+ "eval_acr_loss": 0.461427516861049,
274
+ "eval_across_var": 0.02085038360216778,
275
+ "eval_bleu": 0.9355169660485144,
276
+ "eval_ce_loss": 0.18811083091720598,
277
+ "eval_cos_loss": 0.4924817247886092,
278
+ "eval_cov": 0.08738018715218322,
279
+ "eval_cov_loss": 0.011900046778647186,
280
+ "eval_glb_loss": 0.0,
281
+ "eval_global_kurtosis": 3.0061902341232996,
282
+ "eval_global_mean": -0.007297369579201964,
283
+ "eval_global_var": 1.0064546767979452,
284
+ "eval_krt_loss": 0.00036125986233864483,
285
+ "eval_loss": 0.41625422633946213,
286
+ "eval_mean_loss": 6.224468990837522e-05,
287
+ "eval_mse_loss": 1.1413913132393196,
288
+ "eval_per_loss": 0.0,
289
+ "eval_per_var": 0.9814564604737442,
290
+ "eval_within_var": 0.9855326701244808,
291
+ "eval_wth_loss": 0.0,
292
+ "step": 4096
293
+ },
294
+ {
295
+ "epoch": 0.18918294766985358,
296
+ "eval_acr_loss": 0.461427516861049,
297
+ "eval_across_var": 0.02085038360216778,
298
+ "eval_bleu": 0.9355169660485144,
299
+ "eval_ce_loss": 0.18811083091720598,
300
+ "eval_cos_loss": 0.4924817247886092,
301
+ "eval_cov": 0.08738018715218322,
302
+ "eval_cov_loss": 0.011900046778647186,
303
+ "eval_glb_loss": 0.0,
304
+ "eval_global_kurtosis": 3.0061902341232996,
305
+ "eval_global_mean": -0.007297369579201964,
306
+ "eval_global_var": 1.0064546767979452,
307
+ "eval_krt_loss": 0.00036125986233864483,
308
+ "eval_loss": 0.41625422633946213,
309
+ "eval_mean_loss": 6.224468990837522e-05,
310
+ "eval_mse_loss": 1.1413913132393196,
311
+ "eval_per_loss": 0.0,
312
+ "eval_per_var": 0.9814564604737442,
313
+ "eval_runtime": 155.5272,
314
+ "eval_samples_per_second": 179.988,
315
+ "eval_steps_per_second": 2.816,
316
+ "eval_within_var": 0.9855326701244808,
317
+ "eval_wth_loss": 0.0,
318
+ "step": 4096
319
+ },
320
+ {
321
+ "epoch": 0.20100688189921945,
322
+ "grad_norm": 0.10487012565135956,
323
+ "learning_rate": 4.6265381904878854e-05,
324
+ "loss": 0.5121,
325
+ "step": 4352
326
+ },
327
+ {
328
+ "epoch": 0.2128308161285853,
329
+ "grad_norm": 0.09705886989831924,
330
+ "learning_rate": 4.57465274778347e-05,
331
+ "loss": 0.4805,
332
+ "step": 4608
333
+ },
334
+ {
335
+ "epoch": 0.22465475035795113,
336
+ "grad_norm": 0.10024245083332062,
337
+ "learning_rate": 4.519725074940068e-05,
338
+ "loss": 0.4469,
339
+ "step": 4864
340
+ },
341
+ {
342
+ "epoch": 0.236478684587317,
343
+ "grad_norm": 0.08834318816661835,
344
+ "learning_rate": 4.461835716820895e-05,
345
+ "loss": 0.42,
346
+ "step": 5120
347
+ },
348
+ {
349
+ "epoch": 0.236478684587317,
350
+ "eval_acr_loss": 0.4548332993978779,
351
+ "eval_across_var": 0.028460351478087304,
352
+ "eval_bleu": 0.9552258623563333,
353
+ "eval_ce_loss": 0.12340653128071463,
354
+ "eval_cos_loss": 0.40689416910143206,
355
+ "eval_cov": 0.08788992825164098,
356
+ "eval_cov_loss": 0.01204854997563852,
357
+ "eval_glb_loss": 0.0002671638515636362,
358
+ "eval_global_kurtosis": 2.9975850026901454,
359
+ "eval_global_mean": -0.006530939716182343,
360
+ "eval_global_var": 1.1122132741152968,
361
+ "eval_krt_loss": 0.0002464600221175453,
362
+ "eval_loss": 0.3173055400451024,
363
+ "eval_mean_loss": 5.356442062072988e-05,
364
+ "eval_mse_loss": 0.9764057996610528,
365
+ "eval_per_loss": 5.57458498690526e-06,
366
+ "eval_per_var": 1.0869274400684932,
367
+ "eval_within_var": 1.0837734279022913,
368
+ "eval_wth_loss": 5.189207881040453e-06,
369
+ "step": 5120
370
+ },
371
+ {
372
+ "epoch": 0.236478684587317,
373
+ "eval_acr_loss": 0.4548332993978779,
374
+ "eval_across_var": 0.028460351478087304,
375
+ "eval_bleu": 0.9552258623563333,
376
+ "eval_ce_loss": 0.12340653128071463,
377
+ "eval_cos_loss": 0.40689416910143206,
378
+ "eval_cov": 0.08788992825164098,
379
+ "eval_cov_loss": 0.01204854997563852,
380
+ "eval_glb_loss": 0.0002671638515636362,
381
+ "eval_global_kurtosis": 2.9975850026901454,
382
+ "eval_global_mean": -0.006530939716182343,
383
+ "eval_global_var": 1.1122132741152968,
384
+ "eval_krt_loss": 0.0002464600221175453,
385
+ "eval_loss": 0.3173055400451024,
386
+ "eval_mean_loss": 5.356442062072988e-05,
387
+ "eval_mse_loss": 0.9764057996610528,
388
+ "eval_per_loss": 5.57458498690526e-06,
389
+ "eval_per_var": 1.0869274400684932,
390
+ "eval_runtime": 154.1747,
391
+ "eval_samples_per_second": 181.567,
392
+ "eval_steps_per_second": 2.841,
393
+ "eval_within_var": 1.0837734279022913,
394
+ "eval_wth_loss": 5.189207881040453e-06,
395
+ "step": 5120
396
+ },
397
+ {
398
+ "epoch": 0.24830261881668284,
399
+ "grad_norm": 0.08900712430477142,
400
+ "learning_rate": 4.401069561246422e-05,
401
+ "loss": 0.3965,
402
+ "step": 5376
403
+ },
404
+ {
405
+ "epoch": 0.2601265530460487,
406
+ "grad_norm": 0.08696399629116058,
407
+ "learning_rate": 4.337515714516545e-05,
408
+ "loss": 0.3747,
409
+ "step": 5632
410
+ },
411
+ {
412
+ "epoch": 0.27195048727541454,
413
+ "grad_norm": 0.09633397310972214,
414
+ "learning_rate": 4.2712673707468434e-05,
415
+ "loss": 0.3594,
416
+ "step": 5888
417
+ },
418
+ {
419
+ "epoch": 0.2837744215047804,
420
+ "grad_norm": 0.08299960196018219,
421
+ "learning_rate": 4.202421675210565e-05,
422
+ "loss": 0.3425,
423
+ "step": 6144
424
+ },
425
+ {
426
+ "epoch": 0.2837744215047804,
427
+ "eval_acr_loss": 0.444739139651599,
428
+ "eval_across_var": 0.037298334392476575,
429
+ "eval_bleu": 0.9670970291714691,
430
+ "eval_ce_loss": 0.08759757838122649,
431
+ "eval_cos_loss": 0.34691714662122947,
432
+ "eval_cov": 0.08670684953802797,
433
+ "eval_cov_loss": 0.011868421341288307,
434
+ "eval_glb_loss": 0.0008387476386425913,
435
+ "eval_global_kurtosis": 2.998767622529644,
436
+ "eval_global_mean": -0.007462886234396669,
437
+ "eval_global_var": 1.1261125677796804,
438
+ "eval_krt_loss": 0.00038161360164841305,
439
+ "eval_loss": 0.2573624641111452,
440
+ "eval_mean_loss": 6.83625881451915e-05,
441
+ "eval_mse_loss": 0.8629201449215684,
442
+ "eval_per_loss": 0.00010140475162446005,
443
+ "eval_per_var": 1.1030652468607305,
444
+ "eval_within_var": 1.0889683330440085,
445
+ "eval_wth_loss": 2.5263947968048936e-05,
446
+ "step": 6144
447
+ },
448
+ {
449
+ "epoch": 0.2837744215047804,
450
+ "eval_acr_loss": 0.444739139651599,
451
+ "eval_across_var": 0.037298334392476575,
452
+ "eval_bleu": 0.9670970291714691,
453
+ "eval_ce_loss": 0.08759757838122649,
454
+ "eval_cos_loss": 0.34691714662122947,
455
+ "eval_cov": 0.08670684953802797,
456
+ "eval_cov_loss": 0.011868421341288307,
457
+ "eval_glb_loss": 0.0008387476386425913,
458
+ "eval_global_kurtosis": 2.998767622529644,
459
+ "eval_global_mean": -0.007462886234396669,
460
+ "eval_global_var": 1.1261125677796804,
461
+ "eval_krt_loss": 0.00038161360164841305,
462
+ "eval_loss": 0.2573624641111452,
463
+ "eval_mean_loss": 6.83625881451915e-05,
464
+ "eval_mse_loss": 0.8629201449215684,
465
+ "eval_per_loss": 0.00010140475162446005,
466
+ "eval_per_var": 1.1030652468607305,
467
+ "eval_runtime": 153.8249,
468
+ "eval_samples_per_second": 181.98,
469
+ "eval_steps_per_second": 2.847,
470
+ "eval_within_var": 1.0889683330440085,
471
+ "eval_wth_loss": 2.5263947968048936e-05,
472
+ "step": 6144
473
+ },
474
+ {
475
+ "epoch": 0.2955983557341462,
476
+ "grad_norm": 0.08428088575601578,
477
+ "learning_rate": 4.131079581886694e-05,
478
+ "loss": 0.325,
479
+ "step": 6400
480
+ },
481
+ {
482
+ "epoch": 0.30742228996351206,
483
+ "grad_norm": 0.08620952814817429,
484
+ "learning_rate": 4.057345705423016e-05,
485
+ "loss": 0.3121,
486
+ "step": 6656
487
+ },
488
+ {
489
+ "epoch": 0.3192462241928779,
490
+ "grad_norm": 0.08939851075410843,
491
+ "learning_rate": 3.981328167731251e-05,
492
+ "loss": 0.3016,
493
+ "step": 6912
494
+ },
495
+ {
496
+ "epoch": 0.3310701584222438,
497
+ "grad_norm": 0.08685296773910522,
498
+ "learning_rate": 3.9031384394391954e-05,
499
+ "loss": 0.2875,
500
+ "step": 7168
501
+ },
502
+ {
503
+ "epoch": 0.3310701584222438,
504
+ "eval_acr_loss": 0.4308900043164214,
505
+ "eval_across_var": 0.04932180811230042,
506
+ "eval_bleu": 0.9757814026870458,
507
+ "eval_ce_loss": 0.0650875303026748,
508
+ "eval_cos_loss": 0.3049790498600703,
509
+ "eval_cov": 0.0888557608269121,
510
+ "eval_cov_loss": 0.012648627446904847,
511
+ "eval_glb_loss": 0.0011189478131933299,
512
+ "eval_global_kurtosis": 3.0107862796957634,
513
+ "eval_global_mean": -0.007322731763804885,
514
+ "eval_global_var": 1.130718910530822,
515
+ "eval_krt_loss": 0.0008007638278780262,
516
+ "eval_loss": 0.217820766683855,
517
+ "eval_mean_loss": 6.851153396431181e-05,
518
+ "eval_mse_loss": 0.7866716142658774,
519
+ "eval_per_loss": 0.0001979704842660462,
520
+ "eval_per_var": 1.1079815032819635,
521
+ "eval_within_var": 1.0816122696279935,
522
+ "eval_wth_loss": 1.0462231934273432e-05,
523
+ "step": 7168
524
+ },
525
+ {
526
+ "epoch": 0.3310701584222438,
527
+ "eval_acr_loss": 0.4308900043164214,
528
+ "eval_across_var": 0.04932180811230042,
529
+ "eval_bleu": 0.9757814026870458,
530
+ "eval_ce_loss": 0.0650875303026748,
531
+ "eval_cos_loss": 0.3049790498600703,
532
+ "eval_cov": 0.0888557608269121,
533
+ "eval_cov_loss": 0.012648627446904847,
534
+ "eval_glb_loss": 0.0011189478131933299,
535
+ "eval_global_kurtosis": 3.0107862796957634,
536
+ "eval_global_mean": -0.007322731763804885,
537
+ "eval_global_var": 1.130718910530822,
538
+ "eval_krt_loss": 0.0008007638278780262,
539
+ "eval_loss": 0.217820766683855,
540
+ "eval_mean_loss": 6.851153396431181e-05,
541
+ "eval_mse_loss": 0.7866716142658774,
542
+ "eval_per_loss": 0.0001979704842660462,
543
+ "eval_per_var": 1.1079815032819635,
544
+ "eval_runtime": 151.7662,
545
+ "eval_samples_per_second": 184.448,
546
+ "eval_steps_per_second": 2.886,
547
+ "eval_within_var": 1.0816122696279935,
548
+ "eval_wth_loss": 1.0462231934273432e-05,
549
+ "step": 7168
550
+ },
551
+ {
552
+ "epoch": 0.34289409265160964,
553
+ "grad_norm": 0.08307478576898575,
554
+ "learning_rate": 3.822891176432382e-05,
555
+ "loss": 0.2798,
556
+ "step": 7424
557
+ },
558
+ {
559
+ "epoch": 0.3547180268809755,
560
+ "grad_norm": 0.07975339144468307,
561
+ "learning_rate": 3.7407040517249335e-05,
562
+ "loss": 0.2702,
563
+ "step": 7680
564
+ },
565
+ {
566
+ "epoch": 0.3665419611103413,
567
+ "grad_norm": 0.08212888985872269,
568
+ "learning_rate": 3.6566975829061614e-05,
569
+ "loss": 0.2607,
570
+ "step": 7936
571
+ },
572
+ {
573
+ "epoch": 0.37836589533970716,
574
+ "grad_norm": 0.07644044607877731,
575
+ "learning_rate": 3.5709949554159355e-05,
576
+ "loss": 0.2519,
577
+ "step": 8192
578
+ },
579
+ {
580
+ "epoch": 0.37836589533970716,
581
+ "eval_acr_loss": 0.41207800308863324,
582
+ "eval_across_var": 0.0661766531167033,
583
+ "eval_bleu": 0.9805728687340939,
584
+ "eval_ce_loss": 0.05073687865411582,
585
+ "eval_cos_loss": 0.2765354519905565,
586
+ "eval_cov": 0.09245990073844178,
587
+ "eval_cov_loss": 0.013950884125310264,
588
+ "eval_glb_loss": 0.0015832305986751307,
589
+ "eval_global_kurtosis": 3.0033102030079113,
590
+ "eval_global_mean": -0.008325645232309489,
591
+ "eval_global_var": 1.1371356842180365,
592
+ "eval_krt_loss": 0.0005490112146340372,
593
+ "eval_loss": 0.19166131559164012,
594
+ "eval_mean_loss": 8.809174024773519e-05,
595
+ "eval_mse_loss": 0.7389907115670644,
596
+ "eval_per_loss": 0.00044034952490991304,
597
+ "eval_per_var": 1.1152143086472603,
598
+ "eval_within_var": 1.0713012561406174,
599
+ "eval_wth_loss": 1.391253488354359e-06,
600
+ "step": 8192
601
+ },
602
+ {
603
+ "epoch": 0.37836589533970716,
604
+ "eval_acr_loss": 0.41207800308863324,
605
+ "eval_across_var": 0.0661766531167033,
606
+ "eval_bleu": 0.9805728687340939,
607
+ "eval_ce_loss": 0.05073687865411582,
608
+ "eval_cos_loss": 0.2765354519905565,
609
+ "eval_cov": 0.09245990073844178,
610
+ "eval_cov_loss": 0.013950884125310264,
611
+ "eval_glb_loss": 0.0015832305986751307,
612
+ "eval_global_kurtosis": 3.0033102030079113,
613
+ "eval_global_mean": -0.008325645232309489,
614
+ "eval_global_var": 1.1371356842180365,
615
+ "eval_krt_loss": 0.0005490112146340372,
616
+ "eval_loss": 0.19166131559164012,
617
+ "eval_mean_loss": 8.809174024773519e-05,
618
+ "eval_mse_loss": 0.7389907115670644,
619
+ "eval_per_loss": 0.00044034952490991304,
620
+ "eval_per_var": 1.1152143086472603,
621
+ "eval_runtime": 151.0158,
622
+ "eval_samples_per_second": 185.365,
623
+ "eval_steps_per_second": 2.9,
624
+ "eval_within_var": 1.0713012561406174,
625
+ "eval_wth_loss": 1.391253488354359e-06,
626
+ "step": 8192
627
+ },
628
+ {
629
+ "epoch": 0.390189829569073,
630
+ "grad_norm": 0.07752202451229095,
631
+ "learning_rate": 3.483721841907964e-05,
632
+ "loss": 0.2462,
633
+ "step": 8448
634
+ },
635
+ {
636
+ "epoch": 0.4020137637984389,
637
+ "grad_norm": 0.08402163535356522,
638
+ "learning_rate": 3.3953554020219734e-05,
639
+ "loss": 0.2397,
640
+ "step": 8704
641
+ },
642
+ {
643
+ "epoch": 0.41383769802780473,
644
+ "grad_norm": 0.08829676359891891,
645
+ "learning_rate": 3.305332229492516e-05,
646
+ "loss": 0.2323,
647
+ "step": 8960
648
+ },
649
+ {
650
+ "epoch": 0.4256616322571706,
651
+ "grad_norm": 0.07489398866891861,
652
+ "learning_rate": 3.214128133561262e-05,
653
+ "loss": 0.2275,
654
+ "step": 9216
655
+ },
656
+ {
657
+ "epoch": 0.4256616322571706,
658
+ "eval_acr_loss": 0.3899558907094067,
659
+ "eval_across_var": 0.08639241476037186,
660
+ "eval_bleu": 0.9839023939205267,
661
+ "eval_ce_loss": 0.04116568632730066,
662
+ "eval_cos_loss": 0.25615145446366916,
663
+ "eval_cov": 0.09534583244149543,
664
+ "eval_cov_loss": 0.015230727917833687,
665
+ "eval_glb_loss": 0.001839591442792664,
666
+ "eval_global_kurtosis": 3.0162572811727655,
667
+ "eval_global_mean": -0.008430217362974333,
668
+ "eval_global_var": 1.1393898045091324,
669
+ "eval_krt_loss": 0.001037847468692951,
670
+ "eval_loss": 0.17338106053196675,
671
+ "eval_mean_loss": 9.661306929678656e-05,
672
+ "eval_mse_loss": 0.7077833269829075,
673
+ "eval_per_loss": 0.0006317324198246723,
674
+ "eval_per_var": 1.1172164847317352,
675
+ "eval_within_var": 1.053544073616533,
676
+ "eval_wth_loss": 0.0,
677
+ "step": 9216
678
+ },
679
+ {
680
+ "epoch": 0.4256616322571706,
681
+ "eval_acr_loss": 0.3899558907094067,
682
+ "eval_across_var": 0.08639241476037186,
683
+ "eval_bleu": 0.9839023939205267,
684
+ "eval_ce_loss": 0.04116568632730066,
685
+ "eval_cos_loss": 0.25615145446366916,
686
+ "eval_cov": 0.09534583244149543,
687
+ "eval_cov_loss": 0.015230727917833687,
688
+ "eval_glb_loss": 0.001839591442792664,
689
+ "eval_global_kurtosis": 3.0162572811727655,
690
+ "eval_global_mean": -0.008430217362974333,
691
+ "eval_global_var": 1.1393898045091324,
692
+ "eval_krt_loss": 0.001037847468692951,
693
+ "eval_loss": 0.17338106053196675,
694
+ "eval_mean_loss": 9.661306929678656e-05,
695
+ "eval_mse_loss": 0.7077833269829075,
696
+ "eval_per_loss": 0.0006317324198246723,
697
+ "eval_per_var": 1.1172164847317352,
698
+ "eval_runtime": 149.7127,
699
+ "eval_samples_per_second": 186.978,
700
+ "eval_steps_per_second": 2.926,
701
+ "eval_within_var": 1.053544073616533,
702
+ "eval_wth_loss": 0.0,
703
+ "step": 9216
704
+ },
705
+ {
706
+ "epoch": 0.4374855664865364,
707
+ "grad_norm": 0.07958604395389557,
708
+ "learning_rate": 3.1218768541274476e-05,
709
+ "loss": 0.2225,
710
+ "step": 9472
711
+ },
712
+ {
713
+ "epoch": 0.44930950071590225,
714
+ "grad_norm": 0.08125226199626923,
715
+ "learning_rate": 3.028713666659683e-05,
716
+ "loss": 0.219,
717
+ "step": 9728
718
+ },
719
+ {
720
+ "epoch": 0.4611334349452681,
721
+ "grad_norm": 0.08044584840536118,
722
+ "learning_rate": 2.9347751838306454e-05,
723
+ "loss": 0.2136,
724
+ "step": 9984
725
+ },
726
+ {
727
+ "epoch": 0.472957369174634,
728
+ "grad_norm": 0.09431233257055283,
729
+ "learning_rate": 2.840199155190943e-05,
730
+ "loss": 0.2109,
731
+ "step": 10240
732
+ },
733
+ {
734
+ "epoch": 0.472957369174634,
735
+ "eval_acr_loss": 0.371521062094327,
736
+ "eval_across_var": 0.10420215363746092,
737
+ "eval_bleu": 0.9864977390539889,
738
+ "eval_ce_loss": 0.03421062290813076,
739
+ "eval_cos_loss": 0.24110814265600622,
740
+ "eval_cov": 0.09938139893692922,
741
+ "eval_cov_loss": 0.016782419545550462,
742
+ "eval_glb_loss": 0.0025077612427728364,
743
+ "eval_global_kurtosis": 3.000300804229632,
744
+ "eval_global_mean": -0.008482733817949686,
745
+ "eval_global_var": 1.1459448130707763,
746
+ "eval_krt_loss": 0.0005616036675252787,
747
+ "eval_loss": 0.16001523779407484,
748
+ "eval_mean_loss": 0.00010505710818644507,
749
+ "eval_mse_loss": 0.6870357463621113,
750
+ "eval_per_loss": 0.0010411720005544605,
751
+ "eval_per_var": 1.1233344927226028,
752
+ "eval_within_var": 1.042447058849683,
753
+ "eval_wth_loss": 0.0,
754
+ "step": 10240
755
+ },
756
+ {
757
+ "epoch": 0.472957369174634,
758
+ "eval_acr_loss": 0.371521062094327,
759
+ "eval_across_var": 0.10420215363746092,
760
+ "eval_bleu": 0.9864977390539889,
761
+ "eval_ce_loss": 0.03421062290813076,
762
+ "eval_cos_loss": 0.24110814265600622,
763
+ "eval_cov": 0.09938139893692922,
764
+ "eval_cov_loss": 0.016782419545550462,
765
+ "eval_glb_loss": 0.0025077612427728364,
766
+ "eval_global_kurtosis": 3.000300804229632,
767
+ "eval_global_mean": -0.008482733817949686,
768
+ "eval_global_var": 1.1459448130707763,
769
+ "eval_krt_loss": 0.0005616036675252787,
770
+ "eval_loss": 0.16001523779407484,
771
+ "eval_mean_loss": 0.00010505710818644507,
772
+ "eval_mse_loss": 0.6870357463621113,
773
+ "eval_per_loss": 0.0010411720005544605,
774
+ "eval_per_var": 1.1233344927226028,
775
+ "eval_runtime": 150.7627,
776
+ "eval_samples_per_second": 185.676,
777
+ "eval_steps_per_second": 2.905,
778
+ "eval_within_var": 1.042447058849683,
779
+ "eval_wth_loss": 0.0,
780
+ "step": 10240
781
+ },
782
+ {
783
+ "epoch": 0.48478130340399983,
784
+ "grad_norm": 0.07165589183568954,
785
+ "learning_rate": 2.745124265175868e-05,
786
+ "loss": 0.2062,
787
+ "step": 10496
788
+ },
789
+ {
790
+ "epoch": 0.49660523763336567,
791
+ "grad_norm": 0.07731898874044418,
792
+ "learning_rate": 2.6496899297412598e-05,
793
+ "loss": 0.2023,
794
+ "step": 10752
795
+ },
796
+ {
797
+ "epoch": 0.5084291718627315,
798
+ "grad_norm": 0.08252639323472977,
799
+ "learning_rate": 2.5544099852682395e-05,
800
+ "loss": 0.2011,
801
+ "step": 11008
802
+ },
803
+ {
804
+ "epoch": 0.5202531060920974,
805
+ "grad_norm": 0.09438033401966095,
806
+ "learning_rate": 2.4586769464065254e-05,
807
+ "loss": 0.1971,
808
+ "step": 11264
809
+ },
810
+ {
811
+ "epoch": 0.5202531060920974,
812
+ "eval_acr_loss": 0.3526782588735563,
813
+ "eval_across_var": 0.1233543821240533,
814
+ "eval_bleu": 0.9885170657049208,
815
+ "eval_ce_loss": 0.02941183759256789,
816
+ "eval_cos_loss": 0.2296683646432341,
817
+ "eval_cov": 0.10448926115689212,
818
+ "eval_cov_loss": 0.018818640007667208,
819
+ "eval_glb_loss": 0.0035778092502643057,
820
+ "eval_global_kurtosis": 3.010114621898355,
821
+ "eval_global_mean": -0.011101974472063318,
822
+ "eval_global_var": 1.1552310751997716,
823
+ "eval_krt_loss": 0.0011141615731220227,
824
+ "eval_loss": 0.15028831549839342,
825
+ "eval_mean_loss": 0.0001692649679361995,
826
+ "eval_mse_loss": 0.6726388316176254,
827
+ "eval_per_loss": 0.0018812230319723378,
828
+ "eval_per_var": 1.1340878281963471,
829
+ "eval_within_var": 1.0327191478041209,
830
+ "eval_wth_loss": 0.0,
831
+ "step": 11264
832
+ },
833
+ {
834
+ "epoch": 0.5202531060920974,
835
+ "eval_acr_loss": 0.3526782588735563,
836
+ "eval_across_var": 0.1233543821240533,
837
+ "eval_bleu": 0.9885170657049208,
838
+ "eval_ce_loss": 0.02941183759256789,
839
+ "eval_cos_loss": 0.2296683646432341,
840
+ "eval_cov": 0.10448926115689212,
841
+ "eval_cov_loss": 0.018818640007667208,
842
+ "eval_glb_loss": 0.0035778092502643057,
843
+ "eval_global_kurtosis": 3.010114621898355,
844
+ "eval_global_mean": -0.011101974472063318,
845
+ "eval_global_var": 1.1552310751997716,
846
+ "eval_krt_loss": 0.0011141615731220227,
847
+ "eval_loss": 0.15028831549839342,
848
+ "eval_mean_loss": 0.0001692649679361995,
849
+ "eval_mse_loss": 0.6726388316176254,
850
+ "eval_per_loss": 0.0018812230319723378,
851
+ "eval_per_var": 1.1340878281963471,
852
+ "eval_runtime": 150.1371,
853
+ "eval_samples_per_second": 186.45,
854
+ "eval_steps_per_second": 2.917,
855
+ "eval_within_var": 1.0327191478041209,
856
+ "eval_wth_loss": 0.0,
857
+ "step": 11264
858
+ },
859
+ {
860
+ "epoch": 0.5320770403214632,
861
+ "grad_norm": 0.08242896944284439,
862
+ "learning_rate": 2.3630045028609248e-05,
863
+ "loss": 0.1941,
864
+ "step": 11520
865
+ },
866
+ {
867
+ "epoch": 0.5439009745508291,
868
+ "grad_norm": 0.12015263736248016,
869
+ "learning_rate": 2.267532946828065e-05,
870
+ "loss": 0.1923,
871
+ "step": 11776
872
+ },
873
+ {
874
+ "epoch": 0.5557249087801949,
875
+ "grad_norm": 0.07551924884319305,
876
+ "learning_rate": 2.1724022759270597e-05,
877
+ "loss": 0.19,
878
+ "step": 12032
879
+ },
880
+ {
881
+ "epoch": 0.5675488430095608,
882
+ "grad_norm": 0.09451667219400406,
883
+ "learning_rate": 2.0777519879097458e-05,
884
+ "loss": 0.1889,
885
+ "step": 12288
886
+ },
887
+ {
888
+ "epoch": 0.5675488430095608,
889
+ "eval_acr_loss": 0.34067982850281614,
890
+ "eval_across_var": 0.1346761514964305,
891
+ "eval_bleu": 0.9895254511324281,
892
+ "eval_ce_loss": 0.0263222230802456,
893
+ "eval_cos_loss": 0.22149913948556604,
894
+ "eval_cov": 0.10440648745184075,
895
+ "eval_cov_loss": 0.019090854430987955,
896
+ "eval_glb_loss": 0.0031285923291081673,
897
+ "eval_global_kurtosis": 3.0161606329216806,
898
+ "eval_global_mean": -0.01319564518318873,
899
+ "eval_global_var": 1.1500695633561644,
900
+ "eval_krt_loss": 0.0014608212088625325,
901
+ "eval_loss": 0.14371125602490825,
902
+ "eval_mean_loss": 0.0002307435466704484,
903
+ "eval_mse_loss": 0.6642545581408287,
904
+ "eval_per_loss": 0.00148715251082838,
905
+ "eval_per_var": 1.1267546910673516,
906
+ "eval_within_var": 1.0164183696655378,
907
+ "eval_wth_loss": 0.0,
908
+ "step": 12288
909
+ },
910
+ {
911
+ "epoch": 0.5675488430095608,
912
+ "eval_acr_loss": 0.34067982850281614,
913
+ "eval_across_var": 0.1346761514964305,
914
+ "eval_bleu": 0.9895254511324281,
915
+ "eval_ce_loss": 0.0263222230802456,
916
+ "eval_cos_loss": 0.22149913948556604,
917
+ "eval_cov": 0.10440648745184075,
918
+ "eval_cov_loss": 0.019090854430987955,
919
+ "eval_glb_loss": 0.0031285923291081673,
920
+ "eval_global_kurtosis": 3.0161606329216806,
921
+ "eval_global_mean": -0.01319564518318873,
922
+ "eval_global_var": 1.1500695633561644,
923
+ "eval_krt_loss": 0.0014608212088625325,
924
+ "eval_loss": 0.14371125602490825,
925
+ "eval_mean_loss": 0.0002307435466704484,
926
+ "eval_mse_loss": 0.6642545581408287,
927
+ "eval_per_loss": 0.00148715251082838,
928
+ "eval_per_var": 1.1267546910673516,
929
+ "eval_runtime": 149.6169,
930
+ "eval_samples_per_second": 187.098,
931
+ "eval_steps_per_second": 2.927,
932
+ "eval_within_var": 1.0164183696655378,
933
+ "eval_wth_loss": 0.0,
934
+ "step": 12288
935
+ }
936
+ ],
937
+ "logging_steps": 256,
938
+ "max_steps": 21651,
939
+ "num_input_tokens_seen": 0,
940
+ "num_train_epochs": 1,
941
+ "save_steps": 1024,
942
+ "stateful_callbacks": {
943
+ "TrainerControl": {
944
+ "args": {
945
+ "should_epoch_stop": false,
946
+ "should_evaluate": false,
947
+ "should_log": false,
948
+ "should_save": true,
949
+ "should_training_stop": false
950
+ },
951
+ "attributes": {}
952
+ }
953
+ },
954
+ "total_flos": 0.0,
955
+ "train_batch_size": 64,
956
+ "trial_name": null,
957
+ "trial_params": null
958
+ }
checkpoints-v2.6/checkpoint-12288/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae70ca1441ea90d34c54e8c897a025ccc9b6d942bad37cfab215bb440b2ecd4d
3
+ size 5777