Tucano2-0.6B-Base / evals.yaml
nicholasKluge's picture
Upload folder using huggingface_hub
bf5080b verified
evaluations:
arc_challenge_poly_pt_acc: 0.33247863247863246
arc_challenge_poly_pt_acc_norm: 0.3700854700854701
arc_challenge_poly_pt_acc_norm_stderr: 0.014121621753736067
arc_challenge_poly_pt_acc_stderr: 0.013778666871508503
arc_challenge_poly_pt_alias: arc_challenge_poly_pt
assin2_rte_acc,all: 0.5130718954248366
assin2_rte_acc_stderr,all: 0.007136287055837267
assin2_rte_alias: assin2_rte
assin2_rte_f1_macro,all: 0.36891442630353705
assin2_rte_f1_macro_stderr,all: 0.004995590407601964
assin2_sts_alias: assin2_sts
assin2_sts_mse,all: 1.6449387254901962
assin2_sts_mse_stderr,all: N/A
assin2_sts_pearson,all: 0.10122529198593108
assin2_sts_pearson_stderr,all: 0.013509459447594846
assin_entailment_acc: 0.66825
assin_entailment_acc_stderr: 0.0074455922253301296
assin_entailment_alias: assin_entailment
assin_paraphrase_acc: 0.68
assin_paraphrase_acc_stderr: 0.00737655769318252
assin_paraphrase_alias: assin_paraphrase
belebele_por_Latn_acc: 0.26222222222222225
belebele_por_Latn_acc_norm: 0.26222222222222225
belebele_por_Latn_acc_norm_stderr: 0.014669580202217824
belebele_por_Latn_acc_stderr: 0.014669580202217824
belebele_por_Latn_alias: belebele_por_Latn
bluex_acc,all: 0.2114047287899861
bluex_acc,exam_id__UNICAMP_2018: 0.2222222222222222
bluex_acc,exam_id__UNICAMP_2019: 0.12
bluex_acc,exam_id__UNICAMP_2020: 0.2545454545454545
bluex_acc,exam_id__UNICAMP_2021_1: 0.34782608695652173
bluex_acc,exam_id__UNICAMP_2021_2: 0.21568627450980393
bluex_acc,exam_id__UNICAMP_2022: 0.3076923076923077
bluex_acc,exam_id__UNICAMP_2023: 0.3023255813953488
bluex_acc,exam_id__UNICAMP_2024: 0.2222222222222222
bluex_acc,exam_id__USP_2018: 0.14814814814814814
bluex_acc,exam_id__USP_2019: 0.25
bluex_acc,exam_id__USP_2020: 0.21428571428571427
bluex_acc,exam_id__USP_2021: 0.21153846153846154
bluex_acc,exam_id__USP_2022: 0.12244897959183673
bluex_acc,exam_id__USP_2023: 0.11363636363636363
bluex_acc,exam_id__USP_2024: 0.14634146341463414
bluex_acc_stderr,all: 0.008792062892074834
bluex_acc_stderr,exam_id__UNICAMP_2018: 0.032678043004061255
bluex_acc_stderr,exam_id__UNICAMP_2019: 0.02657623294954054
bluex_acc_stderr,exam_id__UNICAMP_2020: 0.03387860333526782
bluex_acc_stderr,exam_id__UNICAMP_2021_1: 0.040525589885644496
bluex_acc_stderr,exam_id__UNICAMP_2021_2: 0.033227440336532375
bluex_acc_stderr,exam_id__UNICAMP_2022: 0.04256832900072988
bluex_acc_stderr,exam_id__UNICAMP_2023: 0.04033585396483944
bluex_acc_stderr,exam_id__UNICAMP_2024: 0.035722618677772114
bluex_acc_stderr,exam_id__USP_2018: 0.02788267879417696
bluex_acc_stderr,exam_id__USP_2019: 0.03952518565922328
bluex_acc_stderr,exam_id__USP_2020: 0.03163824199119833
bluex_acc_stderr,exam_id__USP_2021: 0.03275051381208595
bluex_acc_stderr,exam_id__USP_2022: 0.027021164094854087
bluex_acc_stderr,exam_id__USP_2023: 0.02764738708665832
bluex_acc_stderr,exam_id__USP_2024: 0.031931812220256435
bluex_alias: bluex
calame_pt_acc: 0.5761078998073218
calame_pt_acc_stderr: 0.010848520804992096
calame_pt_alias: calame_pt
calame_pt_perplexity: 7.505161823615618
calame_pt_perplexity_stderr: 0.44787132592301543
enem_challenge_acc,all: 0.2358292512246326
enem_challenge_acc,exam_id__2009: 0.19130434782608696
enem_challenge_acc,exam_id__2010: 0.20512820512820512
enem_challenge_acc,exam_id__2011: 0.2564102564102564
enem_challenge_acc,exam_id__2012: 0.20689655172413793
enem_challenge_acc,exam_id__2013: 0.3055555555555556
enem_challenge_acc,exam_id__2014: 0.22935779816513763
enem_challenge_acc,exam_id__2015: 0.19327731092436976
enem_challenge_acc,exam_id__2016: 0.24793388429752067
enem_challenge_acc,exam_id__2016_2: 0.2601626016260163
enem_challenge_acc,exam_id__2017: 0.2413793103448276
enem_challenge_acc,exam_id__2022: 0.2556390977443609
enem_challenge_acc,exam_id__2023: 0.23703703703703705
enem_challenge_acc_stderr,all: 0.006488970481818106
enem_challenge_acc_stderr,exam_id__2009: 0.021162122076564756
enem_challenge_acc_stderr,exam_id__2010: 0.02161251982325149
enem_challenge_acc_stderr,exam_id__2011: 0.02328408263989128
enem_challenge_acc_stderr,exam_id__2012: 0.02169743606814018
enem_challenge_acc_stderr,exam_id__2013: 0.02555190327825207
enem_challenge_acc_stderr,exam_id__2014: 0.023214349368781237
enem_challenge_acc_stderr,exam_id__2015: 0.020916129450196143
enem_challenge_acc_stderr,exam_id__2016: 0.022637164962996188
enem_challenge_acc_stderr,exam_id__2016_2: 0.02291364100620369
enem_challenge_acc_stderr,exam_id__2017: 0.022839835315721715
enem_challenge_acc_stderr,exam_id__2022: 0.021748044233065218
enem_challenge_acc_stderr,exam_id__2023: 0.021173762287121492
enem_challenge_alias: enem
faquad_nli_acc,all: 0.7846153846153846
faquad_nli_acc_stderr,all: 0.011396120309131366
faquad_nli_alias: faquad_nli
faquad_nli_f1_macro,all: 0.4396551724137931
faquad_nli_f1_macro_stderr,all: 0.00357969847290883
global_piqa_completions_por_latn_braz_acc: 0.76
global_piqa_completions_por_latn_braz_acc_bytes: 0.81
global_piqa_completions_por_latn_braz_acc_bytes_stderr: 0.039427724440366255
global_piqa_completions_por_latn_braz_acc_norm: 0.79
global_piqa_completions_por_latn_braz_acc_norm_stderr: 0.040936018074033236
global_piqa_completions_por_latn_braz_acc_stderr: 0.04292346959909278
global_piqa_completions_por_latn_braz_alias: global_piqa_completions_por_latn_braz
hatebr_offensive_acc,all: 0.4992857142857143
hatebr_offensive_acc_stderr,all: 0.00947482206123735
hatebr_offensive_alias: hatebr_offensive_binary
hatebr_offensive_f1_macro,all: 0.3330157217722725
hatebr_offensive_f1_macro_stderr,all: 0.004215753402721677
hellaswag_poly_pt_acc: 0.3724130458337848
hellaswag_poly_pt_acc_norm: 0.4774081698992307
hellaswag_poly_pt_acc_norm_stderr: 0.005199629971795624
hellaswag_poly_pt_acc_stderr: 0.005032636971394432
hellaswag_poly_pt_alias: hellaswag_poly_pt
lambada_poly_pt_acc: 0.39452745973219483
lambada_poly_pt_acc_stderr: 0.006809228632878824
lambada_poly_pt_alias: lambada_poly_pt
lambada_poly_pt_perplexity: 20.301981926552738
lambada_poly_pt_perplexity_stderr: 0.7092062816226157
mmlu_poly_pt_acc: 0.2718402882017412
mmlu_poly_pt_acc_stderr: 0.003854510385561605
mmlu_poly_pt_alias: mmlu_poly_pt
oab_exams_acc,all: 0.23280182232346242
oab_exams_acc,exam_id__2010-01: 0.25882352941176473
oab_exams_acc,exam_id__2010-02: 0.25
oab_exams_acc,exam_id__2011-03: 0.25252525252525254
oab_exams_acc,exam_id__2011-04: 0.275
oab_exams_acc,exam_id__2011-05: 0.25
oab_exams_acc,exam_id__2012-06: 0.2375
oab_exams_acc,exam_id__2012-06a: 0.25
oab_exams_acc,exam_id__2012-07: 0.15
oab_exams_acc,exam_id__2012-08: 0.2375
oab_exams_acc,exam_id__2012-09: 0.23376623376623376
oab_exams_acc,exam_id__2013-10: 0.2125
oab_exams_acc,exam_id__2013-11: 0.1875
oab_exams_acc,exam_id__2013-12: 0.1875
oab_exams_acc,exam_id__2014-13: 0.2125
oab_exams_acc,exam_id__2014-14: 0.275
oab_exams_acc,exam_id__2014-15: 0.21794871794871795
oab_exams_acc,exam_id__2015-16: 0.225
oab_exams_acc,exam_id__2015-17: 0.24358974358974358
oab_exams_acc,exam_id__2015-18: 0.2375
oab_exams_acc,exam_id__2016-19: 0.20512820512820512
oab_exams_acc,exam_id__2016-20: 0.2375
oab_exams_acc,exam_id__2016-20a: 0.275
oab_exams_acc,exam_id__2016-21: 0.2
oab_exams_acc,exam_id__2017-22: 0.2375
oab_exams_acc,exam_id__2017-23: 0.2125
oab_exams_acc,exam_id__2017-24: 0.225
oab_exams_acc,exam_id__2018-25: 0.2875
oab_exams_acc_stderr,all: 0.005211375619176454
oab_exams_acc_stderr,exam_id__2010-01: 0.027439944269300902
oab_exams_acc_stderr,exam_id__2010-02: 0.024999054164627513
oab_exams_acc_stderr,exam_id__2011-03: 0.02513759836912599
oab_exams_acc_stderr,exam_id__2011-04: 0.028842769768432266
oab_exams_acc_stderr,exam_id__2011-05: 0.027803429899727953
oab_exams_acc_stderr,exam_id__2012-06: 0.027511514707501687
oab_exams_acc_stderr,exam_id__2012-06a: 0.027969270060875333
oab_exams_acc_stderr,exam_id__2012-07: 0.022965539047482306
oab_exams_acc_stderr,exam_id__2012-08: 0.02739801259080428
oab_exams_acc_stderr,exam_id__2012-09: 0.027845780352732368
oab_exams_acc_stderr,exam_id__2013-10: 0.02636327356327225
oab_exams_acc_stderr,exam_id__2013-11: 0.025163741964027368
oab_exams_acc_stderr,exam_id__2013-12: 0.025170557738075854
oab_exams_acc_stderr,exam_id__2014-13: 0.026432573121099534
oab_exams_acc_stderr,exam_id__2014-14: 0.028840484584019492
oab_exams_acc_stderr,exam_id__2014-15: 0.02693219019792365
oab_exams_acc_stderr,exam_id__2015-16: 0.026872817692190045
oab_exams_acc_stderr,exam_id__2015-17: 0.028004812947648394
oab_exams_acc_stderr,exam_id__2015-18: 0.027398518414074452
oab_exams_acc_stderr,exam_id__2016-19: 0.026437389551520193
oab_exams_acc_stderr,exam_id__2016-20: 0.02754894506133313
oab_exams_acc_stderr,exam_id__2016-20a: 0.02874922000308674
oab_exams_acc_stderr,exam_id__2016-21: 0.025822372019185823
oab_exams_acc_stderr,exam_id__2017-22: 0.027404924686794917
oab_exams_acc_stderr,exam_id__2017-23: 0.026461647581670692
oab_exams_acc_stderr,exam_id__2017-24: 0.026969863667221566
oab_exams_acc_stderr,exam_id__2018-25: 0.029244778204497373
oab_exams_alias: oab_exams
portuguese_hate_speech_acc,all: 0.6439482961222092
portuguese_hate_speech_acc_stderr,all: 0.01158607666681163
portuguese_hate_speech_alias: portuguese_hate_speech_binary
portuguese_hate_speech_f1_macro,all: 0.44683000920330285
portuguese_hate_speech_f1_macro_stderr,all: 0.010246030132924607
tweetsentbr_acc,all: 0.2880597014925373
tweetsentbr_acc_stderr,all: 0.007133180251905251
tweetsentbr_alias: tweetsentbr
tweetsentbr_f1_macro,all: 0.20570459469384028
tweetsentbr_f1_macro_stderr,all: 0.0054860743968753725
step: 195000