evaluations: arc_challenge_poly_pt_acc: 0.33247863247863246 arc_challenge_poly_pt_acc_norm: 0.3700854700854701 arc_challenge_poly_pt_acc_norm_stderr: 0.014121621753736067 arc_challenge_poly_pt_acc_stderr: 0.013778666871508503 arc_challenge_poly_pt_alias: arc_challenge_poly_pt assin2_rte_acc,all: 0.5130718954248366 assin2_rte_acc_stderr,all: 0.007136287055837267 assin2_rte_alias: assin2_rte assin2_rte_f1_macro,all: 0.36891442630353705 assin2_rte_f1_macro_stderr,all: 0.004995590407601964 assin2_sts_alias: assin2_sts assin2_sts_mse,all: 1.6449387254901962 assin2_sts_mse_stderr,all: N/A assin2_sts_pearson,all: 0.10122529198593108 assin2_sts_pearson_stderr,all: 0.013509459447594846 assin_entailment_acc: 0.66825 assin_entailment_acc_stderr: 0.0074455922253301296 assin_entailment_alias: assin_entailment assin_paraphrase_acc: 0.68 assin_paraphrase_acc_stderr: 0.00737655769318252 assin_paraphrase_alias: assin_paraphrase belebele_por_Latn_acc: 0.26222222222222225 belebele_por_Latn_acc_norm: 0.26222222222222225 belebele_por_Latn_acc_norm_stderr: 0.014669580202217824 belebele_por_Latn_acc_stderr: 0.014669580202217824 belebele_por_Latn_alias: belebele_por_Latn bluex_acc,all: 0.2114047287899861 bluex_acc,exam_id__UNICAMP_2018: 0.2222222222222222 bluex_acc,exam_id__UNICAMP_2019: 0.12 bluex_acc,exam_id__UNICAMP_2020: 0.2545454545454545 bluex_acc,exam_id__UNICAMP_2021_1: 0.34782608695652173 bluex_acc,exam_id__UNICAMP_2021_2: 0.21568627450980393 bluex_acc,exam_id__UNICAMP_2022: 0.3076923076923077 bluex_acc,exam_id__UNICAMP_2023: 0.3023255813953488 bluex_acc,exam_id__UNICAMP_2024: 0.2222222222222222 bluex_acc,exam_id__USP_2018: 0.14814814814814814 bluex_acc,exam_id__USP_2019: 0.25 bluex_acc,exam_id__USP_2020: 0.21428571428571427 bluex_acc,exam_id__USP_2021: 0.21153846153846154 bluex_acc,exam_id__USP_2022: 0.12244897959183673 bluex_acc,exam_id__USP_2023: 0.11363636363636363 bluex_acc,exam_id__USP_2024: 0.14634146341463414 bluex_acc_stderr,all: 0.008792062892074834 bluex_acc_stderr,exam_id__UNICAMP_2018: 0.032678043004061255 bluex_acc_stderr,exam_id__UNICAMP_2019: 0.02657623294954054 bluex_acc_stderr,exam_id__UNICAMP_2020: 0.03387860333526782 bluex_acc_stderr,exam_id__UNICAMP_2021_1: 0.040525589885644496 bluex_acc_stderr,exam_id__UNICAMP_2021_2: 0.033227440336532375 bluex_acc_stderr,exam_id__UNICAMP_2022: 0.04256832900072988 bluex_acc_stderr,exam_id__UNICAMP_2023: 0.04033585396483944 bluex_acc_stderr,exam_id__UNICAMP_2024: 0.035722618677772114 bluex_acc_stderr,exam_id__USP_2018: 0.02788267879417696 bluex_acc_stderr,exam_id__USP_2019: 0.03952518565922328 bluex_acc_stderr,exam_id__USP_2020: 0.03163824199119833 bluex_acc_stderr,exam_id__USP_2021: 0.03275051381208595 bluex_acc_stderr,exam_id__USP_2022: 0.027021164094854087 bluex_acc_stderr,exam_id__USP_2023: 0.02764738708665832 bluex_acc_stderr,exam_id__USP_2024: 0.031931812220256435 bluex_alias: bluex calame_pt_acc: 0.5761078998073218 calame_pt_acc_stderr: 0.010848520804992096 calame_pt_alias: calame_pt calame_pt_perplexity: 7.505161823615618 calame_pt_perplexity_stderr: 0.44787132592301543 enem_challenge_acc,all: 0.2358292512246326 enem_challenge_acc,exam_id__2009: 0.19130434782608696 enem_challenge_acc,exam_id__2010: 0.20512820512820512 enem_challenge_acc,exam_id__2011: 0.2564102564102564 enem_challenge_acc,exam_id__2012: 0.20689655172413793 enem_challenge_acc,exam_id__2013: 0.3055555555555556 enem_challenge_acc,exam_id__2014: 0.22935779816513763 enem_challenge_acc,exam_id__2015: 0.19327731092436976 enem_challenge_acc,exam_id__2016: 0.24793388429752067 enem_challenge_acc,exam_id__2016_2: 0.2601626016260163 enem_challenge_acc,exam_id__2017: 0.2413793103448276 enem_challenge_acc,exam_id__2022: 0.2556390977443609 enem_challenge_acc,exam_id__2023: 0.23703703703703705 enem_challenge_acc_stderr,all: 0.006488970481818106 enem_challenge_acc_stderr,exam_id__2009: 0.021162122076564756 enem_challenge_acc_stderr,exam_id__2010: 0.02161251982325149 enem_challenge_acc_stderr,exam_id__2011: 0.02328408263989128 enem_challenge_acc_stderr,exam_id__2012: 0.02169743606814018 enem_challenge_acc_stderr,exam_id__2013: 0.02555190327825207 enem_challenge_acc_stderr,exam_id__2014: 0.023214349368781237 enem_challenge_acc_stderr,exam_id__2015: 0.020916129450196143 enem_challenge_acc_stderr,exam_id__2016: 0.022637164962996188 enem_challenge_acc_stderr,exam_id__2016_2: 0.02291364100620369 enem_challenge_acc_stderr,exam_id__2017: 0.022839835315721715 enem_challenge_acc_stderr,exam_id__2022: 0.021748044233065218 enem_challenge_acc_stderr,exam_id__2023: 0.021173762287121492 enem_challenge_alias: enem faquad_nli_acc,all: 0.7846153846153846 faquad_nli_acc_stderr,all: 0.011396120309131366 faquad_nli_alias: faquad_nli faquad_nli_f1_macro,all: 0.4396551724137931 faquad_nli_f1_macro_stderr,all: 0.00357969847290883 global_piqa_completions_por_latn_braz_acc: 0.76 global_piqa_completions_por_latn_braz_acc_bytes: 0.81 global_piqa_completions_por_latn_braz_acc_bytes_stderr: 0.039427724440366255 global_piqa_completions_por_latn_braz_acc_norm: 0.79 global_piqa_completions_por_latn_braz_acc_norm_stderr: 0.040936018074033236 global_piqa_completions_por_latn_braz_acc_stderr: 0.04292346959909278 global_piqa_completions_por_latn_braz_alias: global_piqa_completions_por_latn_braz hatebr_offensive_acc,all: 0.4992857142857143 hatebr_offensive_acc_stderr,all: 0.00947482206123735 hatebr_offensive_alias: hatebr_offensive_binary hatebr_offensive_f1_macro,all: 0.3330157217722725 hatebr_offensive_f1_macro_stderr,all: 0.004215753402721677 hellaswag_poly_pt_acc: 0.3724130458337848 hellaswag_poly_pt_acc_norm: 0.4774081698992307 hellaswag_poly_pt_acc_norm_stderr: 0.005199629971795624 hellaswag_poly_pt_acc_stderr: 0.005032636971394432 hellaswag_poly_pt_alias: hellaswag_poly_pt lambada_poly_pt_acc: 0.39452745973219483 lambada_poly_pt_acc_stderr: 0.006809228632878824 lambada_poly_pt_alias: lambada_poly_pt lambada_poly_pt_perplexity: 20.301981926552738 lambada_poly_pt_perplexity_stderr: 0.7092062816226157 mmlu_poly_pt_acc: 0.2718402882017412 mmlu_poly_pt_acc_stderr: 0.003854510385561605 mmlu_poly_pt_alias: mmlu_poly_pt oab_exams_acc,all: 0.23280182232346242 oab_exams_acc,exam_id__2010-01: 0.25882352941176473 oab_exams_acc,exam_id__2010-02: 0.25 oab_exams_acc,exam_id__2011-03: 0.25252525252525254 oab_exams_acc,exam_id__2011-04: 0.275 oab_exams_acc,exam_id__2011-05: 0.25 oab_exams_acc,exam_id__2012-06: 0.2375 oab_exams_acc,exam_id__2012-06a: 0.25 oab_exams_acc,exam_id__2012-07: 0.15 oab_exams_acc,exam_id__2012-08: 0.2375 oab_exams_acc,exam_id__2012-09: 0.23376623376623376 oab_exams_acc,exam_id__2013-10: 0.2125 oab_exams_acc,exam_id__2013-11: 0.1875 oab_exams_acc,exam_id__2013-12: 0.1875 oab_exams_acc,exam_id__2014-13: 0.2125 oab_exams_acc,exam_id__2014-14: 0.275 oab_exams_acc,exam_id__2014-15: 0.21794871794871795 oab_exams_acc,exam_id__2015-16: 0.225 oab_exams_acc,exam_id__2015-17: 0.24358974358974358 oab_exams_acc,exam_id__2015-18: 0.2375 oab_exams_acc,exam_id__2016-19: 0.20512820512820512 oab_exams_acc,exam_id__2016-20: 0.2375 oab_exams_acc,exam_id__2016-20a: 0.275 oab_exams_acc,exam_id__2016-21: 0.2 oab_exams_acc,exam_id__2017-22: 0.2375 oab_exams_acc,exam_id__2017-23: 0.2125 oab_exams_acc,exam_id__2017-24: 0.225 oab_exams_acc,exam_id__2018-25: 0.2875 oab_exams_acc_stderr,all: 0.005211375619176454 oab_exams_acc_stderr,exam_id__2010-01: 0.027439944269300902 oab_exams_acc_stderr,exam_id__2010-02: 0.024999054164627513 oab_exams_acc_stderr,exam_id__2011-03: 0.02513759836912599 oab_exams_acc_stderr,exam_id__2011-04: 0.028842769768432266 oab_exams_acc_stderr,exam_id__2011-05: 0.027803429899727953 oab_exams_acc_stderr,exam_id__2012-06: 0.027511514707501687 oab_exams_acc_stderr,exam_id__2012-06a: 0.027969270060875333 oab_exams_acc_stderr,exam_id__2012-07: 0.022965539047482306 oab_exams_acc_stderr,exam_id__2012-08: 0.02739801259080428 oab_exams_acc_stderr,exam_id__2012-09: 0.027845780352732368 oab_exams_acc_stderr,exam_id__2013-10: 0.02636327356327225 oab_exams_acc_stderr,exam_id__2013-11: 0.025163741964027368 oab_exams_acc_stderr,exam_id__2013-12: 0.025170557738075854 oab_exams_acc_stderr,exam_id__2014-13: 0.026432573121099534 oab_exams_acc_stderr,exam_id__2014-14: 0.028840484584019492 oab_exams_acc_stderr,exam_id__2014-15: 0.02693219019792365 oab_exams_acc_stderr,exam_id__2015-16: 0.026872817692190045 oab_exams_acc_stderr,exam_id__2015-17: 0.028004812947648394 oab_exams_acc_stderr,exam_id__2015-18: 0.027398518414074452 oab_exams_acc_stderr,exam_id__2016-19: 0.026437389551520193 oab_exams_acc_stderr,exam_id__2016-20: 0.02754894506133313 oab_exams_acc_stderr,exam_id__2016-20a: 0.02874922000308674 oab_exams_acc_stderr,exam_id__2016-21: 0.025822372019185823 oab_exams_acc_stderr,exam_id__2017-22: 0.027404924686794917 oab_exams_acc_stderr,exam_id__2017-23: 0.026461647581670692 oab_exams_acc_stderr,exam_id__2017-24: 0.026969863667221566 oab_exams_acc_stderr,exam_id__2018-25: 0.029244778204497373 oab_exams_alias: oab_exams portuguese_hate_speech_acc,all: 0.6439482961222092 portuguese_hate_speech_acc_stderr,all: 0.01158607666681163 portuguese_hate_speech_alias: portuguese_hate_speech_binary portuguese_hate_speech_f1_macro,all: 0.44683000920330285 portuguese_hate_speech_f1_macro_stderr,all: 0.010246030132924607 tweetsentbr_acc,all: 0.2880597014925373 tweetsentbr_acc_stderr,all: 0.007133180251905251 tweetsentbr_alias: tweetsentbr tweetsentbr_f1_macro,all: 0.20570459469384028 tweetsentbr_f1_macro_stderr,all: 0.0054860743968753725 step: 195000