{ "results": { "lambada_openai": { "ppl": 3288862.4386760374, "ppl_stderr": 311605.46093383565, "acc": 0.0, "acc_stderr": 0.0 }, "hendrycksTest-astronomy": { "acc": 0.16447368421052633, "acc_stderr": 0.030167533468632723, "acc_norm": 0.2236842105263158, "acc_norm_stderr": 0.033911609343436046 }, "winogrande": { "acc": 0.4940805051302289, "acc_stderr": 0.01405150083848581 }, "hendrycksTest-high_school_us_history": { "acc": 0.18627450980392157, "acc_stderr": 0.027325470966716323, "acc_norm": 0.2647058823529412, "acc_norm_stderr": 0.030964517926923393 }, "crows_pairs_french_age": { "likelihood_difference": 5.902777777777778, "likelihood_difference_stderr": 0.745349522367746, "pct_stereotype": 0.6777777777777778, "pct_stereotype_stderr": 0.049536623805744535 }, "hendrycksTest-high_school_computer_science": { "acc": 0.23, "acc_stderr": 0.04229525846816505, "acc_norm": 0.36, "acc_norm_stderr": 0.04824181513244218 }, "hendrycksTest-global_facts": { "acc": 0.33, "acc_stderr": 0.047258156262526045, "acc_norm": 0.32, "acc_norm_stderr": 0.046882617226215034 }, "hendrycksTest-high_school_physics": { "acc": 0.2052980132450331, "acc_stderr": 0.03297986648473836, "acc_norm": 0.2980132450331126, "acc_norm_stderr": 0.03734535676787198 }, "crows_pairs_french_socioeconomic": { "likelihood_difference": 12.683394451530612, "likelihood_difference_stderr": 0.8321591288729919, "pct_stereotype": 0.45918367346938777, "pct_stereotype_stderr": 0.03568624151230552 }, "hendrycksTest-international_law": { "acc": 0.09917355371900827, "acc_stderr": 0.027285246312758957, "acc_norm": 0.2396694214876033, "acc_norm_stderr": 0.03896878985070417 }, "hendrycksTest-medical_genetics": { "acc": 0.26, "acc_stderr": 0.0440844002276808, "acc_norm": 0.2, "acc_norm_stderr": 0.04020151261036845 }, "hendrycksTest-logical_fallacies": { "acc": 0.17177914110429449, "acc_stderr": 0.029634717272371013, "acc_norm": 0.25766871165644173, "acc_norm_stderr": 0.03436150827846917 }, "hendrycksTest-moral_disputes": { "acc": 0.22254335260115607, "acc_stderr": 0.02239421566194282, "acc_norm": 0.21965317919075145, "acc_norm_stderr": 0.022289638852617893 }, "crows_pairs_english_disability": { "likelihood_difference": 7.655769230769231, "likelihood_difference_stderr": 1.2456701776455885, "pct_stereotype": 0.6307692307692307, "pct_stereotype_stderr": 0.060324565928300454 }, "hendrycksTest-prehistory": { "acc": 0.25925925925925924, "acc_stderr": 0.02438366553103545, "acc_norm": 0.24382716049382716, "acc_norm_stderr": 0.023891879541959603 }, "hendrycksTest-college_mathematics": { "acc": 0.14, "acc_stderr": 0.0348735088019777, "acc_norm": 0.19, "acc_norm_stderr": 0.03942772444036623 }, "crows_pairs_french": { "likelihood_difference": 10.100835755813954, "likelihood_difference_stderr": 0.23128974328889199, "pct_stereotype": 0.5819916517590936, "pct_stereotype_stderr": 0.012047969184920519 }, "wsc": { "acc": 0.6346153846153846, "acc_stderr": 0.0474473339327792 }, "hendrycksTest-electrical_engineering": { "acc": 0.2413793103448276, "acc_stderr": 0.03565998174135303, "acc_norm": 0.20689655172413793, "acc_norm_stderr": 0.03375672449560554 }, "crows_pairs_english_physical_appearance": { "likelihood_difference": 6.219184027777778, "likelihood_difference_stderr": 0.8156476562247187, "pct_stereotype": 0.5138888888888888, "pct_stereotype_stderr": 0.05931618532716555 }, "hendrycksTest-management": { "acc": 0.1941747572815534, "acc_stderr": 0.03916667762822582, "acc_norm": 0.23300970873786409, "acc_norm_stderr": 0.04185832598928315 }, "hendrycksTest-machine_learning": { "acc": 0.25, "acc_stderr": 0.04109974682633932, "acc_norm": 0.3125, "acc_norm_stderr": 0.043994650575715215 }, "crows_pairs_english_race_color": { "likelihood_difference": 5.5294045275590555, "likelihood_difference_stderr": 0.34271615785671483, "pct_stereotype": 0.36811023622047245, "pct_stereotype_stderr": 0.021419317453594672 }, "hendrycksTest-marketing": { "acc": 0.2222222222222222, "acc_stderr": 0.027236013946196666, "acc_norm": 0.23931623931623933, "acc_norm_stderr": 0.02795182680892433 }, "hendrycksTest-high_school_chemistry": { "acc": 0.20689655172413793, "acc_stderr": 0.028501378167893946, "acc_norm": 0.22167487684729065, "acc_norm_stderr": 0.029225575892489617 }, "hendrycksTest-econometrics": { "acc": 0.24561403508771928, "acc_stderr": 0.0404933929774814, "acc_norm": 0.2807017543859649, "acc_norm_stderr": 0.04227054451232199 }, "hendrycksTest-virology": { "acc": 0.14457831325301204, "acc_stderr": 0.027377874786362316, "acc_norm": 0.18674698795180722, "acc_norm_stderr": 0.030338749144500615 }, "hendrycksTest-high_school_psychology": { "acc": 0.22752293577981653, "acc_stderr": 0.017974463578776502, "acc_norm": 0.24954128440366974, "acc_norm_stderr": 0.01855389762950162 }, "hendrycksTest-high_school_geography": { "acc": 0.25252525252525254, "acc_stderr": 0.030954055470365897, "acc_norm": 0.2474747474747475, "acc_norm_stderr": 0.03074630074212451 }, "sciq": { "acc": 0.223, "acc_stderr": 0.013169830843425661, "acc_norm": 0.21, "acc_norm_stderr": 0.012886662332274547 }, "crows_pairs_french_religion": { "likelihood_difference": 12.11983695652174, "likelihood_difference_stderr": 0.9761138647537818, "pct_stereotype": 0.6608695652173913, "pct_stereotype_stderr": 0.04433930011819816 }, "crows_pairs_english_gender": { "likelihood_difference": 4.749609375, "likelihood_difference_stderr": 0.4877724715110692, "pct_stereotype": 0.48125, "pct_stereotype_stderr": 0.027974934901776306 }, "hendrycksTest-professional_accounting": { "acc": 0.26595744680851063, "acc_stderr": 0.026358065698880582, "acc_norm": 0.25886524822695034, "acc_norm_stderr": 0.026129572527180848 }, "logiqa": { "acc": 0.2196620583717358, "acc_stderr": 0.01623910941493393, "acc_norm": 0.23809523809523808, "acc_norm_stderr": 0.016705867034419633 }, "hendrycksTest-professional_medicine": { "acc": 0.22794117647058823, "acc_stderr": 0.025483081468029804, "acc_norm": 0.2867647058823529, "acc_norm_stderr": 0.027472274473233818 }, "hendrycksTest-world_religions": { "acc": 0.1695906432748538, "acc_stderr": 0.028782108105401712, "acc_norm": 0.22807017543859648, "acc_norm_stderr": 0.03218093795602357 }, "hendrycksTest-sociology": { "acc": 0.2835820895522388, "acc_stderr": 0.03187187537919796, "acc_norm": 0.2935323383084577, "acc_norm_stderr": 0.032200241045342054 }, "hendrycksTest-professional_psychology": { "acc": 0.22058823529411764, "acc_stderr": 0.01677467236546854, "acc_norm": 0.24019607843137256, "acc_norm_stderr": 0.017282760695167435 }, "hendrycksTest-computer_security": { "acc": 0.26, "acc_stderr": 0.04408440022768078, "acc_norm": 0.28, "acc_norm_stderr": 0.045126085985421276 }, "hendrycksTest-philosophy": { "acc": 0.2379421221864952, "acc_stderr": 0.024185150647818707, "acc_norm": 0.2861736334405145, "acc_norm_stderr": 0.025670259242188943 }, "crows_pairs_french_race_color": { "likelihood_difference": 9.869972826086956, "likelihood_difference_stderr": 0.3709338879215957, "pct_stereotype": 0.7130434782608696, "pct_stereotype_stderr": 0.021113474740601688 }, "hendrycksTest-clinical_knowledge": { "acc": 0.18490566037735848, "acc_stderr": 0.023893351834464324, "acc_norm": 0.28679245283018867, "acc_norm_stderr": 0.027834912527544067 }, "crows_pairs_english": { "likelihood_difference": 5.480079755515802, "likelihood_difference_stderr": 0.19151850776212573, "pct_stereotype": 0.45855694692904, "pct_stereotype_stderr": 0.012171273580365826 }, "crows_pairs_french_nationality": { "likelihood_difference": 9.49802371541502, "likelihood_difference_stderr": 0.5281355544781192, "pct_stereotype": 0.4980237154150198, "pct_stereotype_stderr": 0.031496793380453074 }, "hendrycksTest-nutrition": { "acc": 0.20915032679738563, "acc_stderr": 0.023287685312334803, "acc_norm": 0.24836601307189543, "acc_norm_stderr": 0.02473998135511359 }, "hendrycksTest-college_medicine": { "acc": 0.19653179190751446, "acc_stderr": 0.030299574664788147, "acc_norm": 0.24855491329479767, "acc_norm_stderr": 0.03295304696818318 }, "crows_pairs_english_socioeconomic": { "likelihood_difference": 5.714967105263158, "likelihood_difference_stderr": 0.5307740830599903, "pct_stereotype": 0.5684210526315789, "pct_stereotype_stderr": 0.03602751443822843 }, "crows_pairs_english_autre": { "likelihood_difference": 5.2414772727272725, "likelihood_difference_stderr": 2.881736459713796, "pct_stereotype": 0.7272727272727273, "pct_stereotype_stderr": 0.14083575804390605 }, "hendrycksTest-anatomy": { "acc": 0.2074074074074074, "acc_stderr": 0.03502553170678318, "acc_norm": 0.28888888888888886, "acc_norm_stderr": 0.0391545063041425 }, "hendrycksTest-elementary_mathematics": { "acc": 0.2037037037037037, "acc_stderr": 0.02074274056012268, "acc_norm": 0.21957671957671956, "acc_norm_stderr": 0.021320018599770375 }, "hendrycksTest-high_school_biology": { "acc": 0.2, "acc_stderr": 0.022755204959542936, "acc_norm": 0.22580645161290322, "acc_norm_stderr": 0.02378557788418101 }, "crows_pairs_english_sexual_orientation": { "likelihood_difference": 5.359206989247312, "likelihood_difference_stderr": 0.7683231947337748, "pct_stereotype": 0.6021505376344086, "pct_stereotype_stderr": 0.0510291122856655 }, "hendrycksTest-high_school_statistics": { "acc": 0.23148148148148148, "acc_stderr": 0.02876511171804696, "acc_norm": 0.2962962962962963, "acc_norm_stderr": 0.031141447823536037 }, "crows_pairs_french_physical_appearance": { "likelihood_difference": 10.003472222222221, "likelihood_difference_stderr": 1.3633059287800664, "pct_stereotype": 0.4861111111111111, "pct_stereotype_stderr": 0.059316185327165566 }, "hendrycksTest-formal_logic": { "acc": 0.2857142857142857, "acc_stderr": 0.04040610178208841, "acc_norm": 0.25396825396825395, "acc_norm_stderr": 0.038932596106046706 }, "hendrycksTest-human_sexuality": { "acc": 0.25190839694656486, "acc_stderr": 0.038073871163060866, "acc_norm": 0.25190839694656486, "acc_norm_stderr": 0.038073871163060866 }, "hendrycksTest-moral_scenarios": { "acc": 0.23798882681564246, "acc_stderr": 0.014242630070574915, "acc_norm": 0.23798882681564246, "acc_norm_stderr": 0.014242630070574915 }, "hendrycksTest-abstract_algebra": { "acc": 0.2, "acc_stderr": 0.04020151261036845, "acc_norm": 0.17, "acc_norm_stderr": 0.03775251680686371 }, "arc_easy": { "acc": 0.2668350168350168, "acc_stderr": 0.00907591585926725, "acc_norm": 0.2638888888888889, "acc_norm_stderr": 0.009043789220055139 }, "hendrycksTest-college_chemistry": { "acc": 0.21, "acc_stderr": 0.04093601807403325, "acc_norm": 0.26, "acc_norm_stderr": 0.0440844002276808 }, "hendrycksTest-high_school_microeconomics": { "acc": 0.18067226890756302, "acc_stderr": 0.02499196496660074, "acc_norm": 0.2773109243697479, "acc_norm_stderr": 0.029079374539480007 }, "hendrycksTest-jurisprudence": { "acc": 0.18518518518518517, "acc_stderr": 0.03755265865037181, "acc_norm": 0.24074074074074073, "acc_norm_stderr": 0.041331194402438376 }, "hendrycksTest-college_physics": { "acc": 0.13725490196078433, "acc_stderr": 0.03424084669891521, "acc_norm": 0.20588235294117646, "acc_norm_stderr": 0.04023382273617747 }, "hendrycksTest-public_relations": { "acc": 0.2636363636363636, "acc_stderr": 0.04220224692971987, "acc_norm": 0.20909090909090908, "acc_norm_stderr": 0.03895091015724138 }, "crows_pairs_english_religion": { "likelihood_difference": 5.728322072072072, "likelihood_difference_stderr": 0.6965067589462834, "pct_stereotype": 0.45045045045045046, "pct_stereotype_stderr": 0.04743846177747609 }, "hendrycksTest-high_school_macroeconomics": { "acc": 0.2230769230769231, "acc_stderr": 0.02110773012724399, "acc_norm": 0.25384615384615383, "acc_norm_stderr": 0.022066054378726257 }, "crows_pairs_french_sexual_orientation": { "likelihood_difference": 15.282967032967033, "likelihood_difference_stderr": 1.0847203102990313, "pct_stereotype": 0.8021978021978022, "pct_stereotype_stderr": 0.04198895203196222 }, "hendrycksTest-professional_law": { "acc": 0.23533246414602346, "acc_stderr": 0.010834432543912219, "acc_norm": 0.25684485006518903, "acc_norm_stderr": 0.011158455853098851 }, "hendrycksTest-high_school_world_history": { "acc": 0.189873417721519, "acc_stderr": 0.025530100460233494, "acc_norm": 0.22362869198312235, "acc_norm_stderr": 0.02712329820522997 }, "hendrycksTest-business_ethics": { "acc": 0.26, "acc_stderr": 0.044084400227680794, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720684 }, "crows_pairs_french_autre": { "likelihood_difference": 7.216346153846154, "likelihood_difference_stderr": 1.9704931663267538, "pct_stereotype": 0.46153846153846156, "pct_stereotype_stderr": 0.14390989949130545 }, "hendrycksTest-conceptual_physics": { "acc": 0.2723404255319149, "acc_stderr": 0.029101290698386708, "acc_norm": 0.25957446808510637, "acc_norm_stderr": 0.02865917937429232 }, "crows_pairs_english_age": { "likelihood_difference": 3.418269230769231, "likelihood_difference_stderr": 0.6082631522720632, "pct_stereotype": 0.5274725274725275, "pct_stereotype_stderr": 0.05262501097748859 }, "hendrycksTest-us_foreign_policy": { "acc": 0.23, "acc_stderr": 0.04229525846816505, "acc_norm": 0.25, "acc_norm_stderr": 0.04351941398892446 }, "arc_challenge": { "acc": 0.20477815699658702, "acc_stderr": 0.01179254433851342, "acc_norm": 0.2440273037542662, "acc_norm_stderr": 0.01255144762785626 }, "hendrycksTest-high_school_european_history": { "acc": 0.17575757575757575, "acc_stderr": 0.02972094300622445, "acc_norm": 0.22424242424242424, "acc_norm_stderr": 0.03256866661681102 }, "hendrycksTest-miscellaneous": { "acc": 0.2503192848020434, "acc_stderr": 0.015491088951494588, "acc_norm": 0.25287356321839083, "acc_norm_stderr": 0.015543377313719681 }, "hendrycksTest-college_biology": { "acc": 0.2569444444444444, "acc_stderr": 0.03653946969442099, "acc_norm": 0.25, "acc_norm_stderr": 0.03621034121889507 }, "crows_pairs_english_nationality": { "likelihood_difference": 6.14380787037037, "likelihood_difference_stderr": 0.5217915071777064, "pct_stereotype": 0.37037037037037035, "pct_stereotype_stderr": 0.03293377139415191 }, "crows_pairs_french_gender": { "likelihood_difference": 7.492017133956386, "likelihood_difference_stderr": 0.4566662635366117, "pct_stereotype": 0.48286604361370716, "pct_stereotype_stderr": 0.027934433698537306 }, "piqa": { "acc": 0.5272034820457019, "acc_stderr": 0.011648545262429021, "acc_norm": 0.5261153427638737, "acc_norm_stderr": 0.011649900854263415 }, "hendrycksTest-security_studies": { "acc": 0.31020408163265306, "acc_stderr": 0.02961345987248438, "acc_norm": 0.19183673469387755, "acc_norm_stderr": 0.025206963154225378 }, "hendrycksTest-high_school_government_and_politics": { "acc": 0.19170984455958548, "acc_stderr": 0.02840895362624527, "acc_norm": 0.24870466321243523, "acc_norm_stderr": 0.03119584087770028 }, "crows_pairs_french_disability": { "likelihood_difference": 14.775568181818182, "likelihood_difference_stderr": 1.4715579883772572, "pct_stereotype": 0.3939393939393939, "pct_stereotype_stderr": 0.06060606060606062 }, "hendrycksTest-college_computer_science": { "acc": 0.21, "acc_stderr": 0.040936018074033256, "acc_norm": 0.17, "acc_norm_stderr": 0.03775251680686371 }, "hendrycksTest-high_school_mathematics": { "acc": 0.2074074074074074, "acc_stderr": 0.02472071319395215, "acc_norm": 0.26666666666666666, "acc_norm_stderr": 0.026962424325073824 }, "hendrycksTest-human_aging": { "acc": 0.273542600896861, "acc_stderr": 0.029918586707798824, "acc_norm": 0.27802690582959644, "acc_norm_stderr": 0.030069584874494033 } }, "versions": { "lambada_openai": 0, "hendrycksTest-astronomy": 0, "winogrande": 0, "hendrycksTest-high_school_us_history": 0, "crows_pairs_french_age": 0, "hendrycksTest-high_school_computer_science": 0, "hendrycksTest-global_facts": 0, "hendrycksTest-high_school_physics": 0, "crows_pairs_french_socioeconomic": 0, "hendrycksTest-international_law": 0, "hendrycksTest-medical_genetics": 0, "hendrycksTest-logical_fallacies": 0, "hendrycksTest-moral_disputes": 0, "crows_pairs_english_disability": 0, "hendrycksTest-prehistory": 0, "hendrycksTest-college_mathematics": 0, "crows_pairs_french": 0, "wsc": 0, "hendrycksTest-electrical_engineering": 0, "crows_pairs_english_physical_appearance": 0, "hendrycksTest-management": 0, "hendrycksTest-machine_learning": 0, "crows_pairs_english_race_color": 0, "hendrycksTest-marketing": 0, "hendrycksTest-high_school_chemistry": 0, "hendrycksTest-econometrics": 0, "hendrycksTest-virology": 0, "hendrycksTest-high_school_psychology": 0, "hendrycksTest-high_school_geography": 0, "sciq": 0, "crows_pairs_french_religion": 0, "crows_pairs_english_gender": 0, "hendrycksTest-professional_accounting": 0, "logiqa": 0, "hendrycksTest-professional_medicine": 0, "hendrycksTest-world_religions": 0, "hendrycksTest-sociology": 0, "hendrycksTest-professional_psychology": 0, "hendrycksTest-computer_security": 0, "hendrycksTest-philosophy": 0, "crows_pairs_french_race_color": 0, "hendrycksTest-clinical_knowledge": 0, "crows_pairs_english": 0, "crows_pairs_french_nationality": 0, "hendrycksTest-nutrition": 0, "hendrycksTest-college_medicine": 0, "crows_pairs_english_socioeconomic": 0, "crows_pairs_english_autre": 0, "hendrycksTest-anatomy": 0, "hendrycksTest-elementary_mathematics": 0, "hendrycksTest-high_school_biology": 0, "crows_pairs_english_sexual_orientation": 0, "hendrycksTest-high_school_statistics": 0, "crows_pairs_french_physical_appearance": 0, "hendrycksTest-formal_logic": 0, "hendrycksTest-human_sexuality": 0, "hendrycksTest-moral_scenarios": 0, "hendrycksTest-abstract_algebra": 0, "arc_easy": 0, "hendrycksTest-college_chemistry": 0, "hendrycksTest-high_school_microeconomics": 0, "hendrycksTest-jurisprudence": 0, "hendrycksTest-college_physics": 0, "hendrycksTest-public_relations": 0, "crows_pairs_english_religion": 0, "hendrycksTest-high_school_macroeconomics": 0, "crows_pairs_french_sexual_orientation": 0, "hendrycksTest-professional_law": 0, "hendrycksTest-high_school_world_history": 0, "hendrycksTest-business_ethics": 0, "crows_pairs_french_autre": 0, "hendrycksTest-conceptual_physics": 0, "crows_pairs_english_age": 0, "hendrycksTest-us_foreign_policy": 0, "arc_challenge": 0, "hendrycksTest-high_school_european_history": 0, "hendrycksTest-miscellaneous": 0, "hendrycksTest-college_biology": 0, "crows_pairs_english_nationality": 0, "crows_pairs_french_gender": 0, "piqa": 0, "hendrycksTest-security_studies": 0, "hendrycksTest-high_school_government_and_politics": 0, "crows_pairs_french_disability": 0, "hendrycksTest-college_computer_science": 0, "hendrycksTest-high_school_mathematics": 0, "hendrycksTest-human_aging": 0 }, "config": { "model": "hf-causal", "model_args": "use_accelerate=True,pretrained=EleutherAI/pythia-v1.1-70m,revision=step32", "num_fewshot": 0, "batch_size": 32, "device": null, "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {} } }