{ "results": { "crows_pairs_french_gender": { "likelihood_difference": 5.687986760124611, "likelihood_difference_stderr": 0.2761406584883121, "pct_stereotype": 0.470404984423676, "pct_stereotype_stderr": 0.02790184442005117 }, "hendrycksTest-marketing": { "acc": 0.23076923076923078, "acc_stderr": 0.027601921381417604, "acc_norm": 0.23076923076923078, "acc_norm_stderr": 0.027601921381417604 }, "hendrycksTest-high_school_psychology": { "acc": 0.24587155963302754, "acc_stderr": 0.01846194096870845, "acc_norm": 0.26972477064220185, "acc_norm_stderr": 0.019028486711115445 }, "hendrycksTest-college_chemistry": { "acc": 0.23, "acc_stderr": 0.04229525846816507, "acc_norm": 0.23, "acc_norm_stderr": 0.042295258468165065 }, "hendrycksTest-abstract_algebra": { "acc": 0.21, "acc_stderr": 0.040936018074033256, "acc_norm": 0.14, "acc_norm_stderr": 0.03487350880197772 }, "hendrycksTest-high_school_chemistry": { "acc": 0.1921182266009852, "acc_stderr": 0.02771931570961477, "acc_norm": 0.22167487684729065, "acc_norm_stderr": 0.029225575892489614 }, "hendrycksTest-econometrics": { "acc": 0.2543859649122807, "acc_stderr": 0.040969851398436695, "acc_norm": 0.2719298245614035, "acc_norm_stderr": 0.04185774424022056 }, "crows_pairs_english_disability": { "likelihood_difference": 6.718269230769231, "likelihood_difference_stderr": 0.8596632745046646, "pct_stereotype": 0.4461538461538462, "pct_stereotype_stderr": 0.06213651700539812 }, "hendrycksTest-high_school_computer_science": { "acc": 0.19, "acc_stderr": 0.03942772444036623, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720684 }, "crows_pairs_french_nationality": { "likelihood_difference": 6.856966403162056, "likelihood_difference_stderr": 0.32916109908316876, "pct_stereotype": 0.5454545454545454, "pct_stereotype_stderr": 0.0313666163337434 }, "hendrycksTest-public_relations": { "acc": 0.2818181818181818, "acc_stderr": 0.04309118709946458, "acc_norm": 0.18181818181818182, "acc_norm_stderr": 0.036942843353377997 }, "crows_pairs_english_age": { "likelihood_difference": 3.380837912087912, "likelihood_difference_stderr": 0.49983969692767516, "pct_stereotype": 0.5164835164835165, "pct_stereotype_stderr": 0.05267597952306975 }, "logiqa": { "acc": 0.21044546850998463, "acc_stderr": 0.015988369488888755, "acc_norm": 0.23348694316436253, "acc_norm_stderr": 0.016593362460570887 }, "hendrycksTest-human_aging": { "acc": 0.3273542600896861, "acc_stderr": 0.03149384670994131, "acc_norm": 0.29596412556053814, "acc_norm_stderr": 0.03063659134869981 }, "hendrycksTest-formal_logic": { "acc": 0.2698412698412698, "acc_stderr": 0.03970158273235172, "acc_norm": 0.30158730158730157, "acc_norm_stderr": 0.04104947269903394 }, "hendrycksTest-clinical_knowledge": { "acc": 0.18490566037735848, "acc_stderr": 0.023893351834464324, "acc_norm": 0.3169811320754717, "acc_norm_stderr": 0.02863723563980091 }, "crows_pairs_french_autre": { "likelihood_difference": 5.5, "likelihood_difference_stderr": 0.9504975080559196, "pct_stereotype": 0.5384615384615384, "pct_stereotype_stderr": 0.14390989949130545 }, "hendrycksTest-moral_disputes": { "acc": 0.2254335260115607, "acc_stderr": 0.02249723019096755, "acc_norm": 0.22832369942196531, "acc_norm_stderr": 0.022598703804321624 }, "crows_pairs_english_sexual_orientation": { "likelihood_difference": 5.677083333333333, "likelihood_difference_stderr": 0.6194240763408452, "pct_stereotype": 0.6451612903225806, "pct_stereotype_stderr": 0.049883363937668256 }, "hendrycksTest-professional_psychology": { "acc": 0.25326797385620914, "acc_stderr": 0.01759348689536683, "acc_norm": 0.272875816993464, "acc_norm_stderr": 0.018020474148393577 }, "hendrycksTest-high_school_microeconomics": { "acc": 0.18907563025210083, "acc_stderr": 0.025435119438105357, "acc_norm": 0.2773109243697479, "acc_norm_stderr": 0.02907937453948001 }, "hendrycksTest-high_school_statistics": { "acc": 0.2175925925925926, "acc_stderr": 0.028139689444859645, "acc_norm": 0.23148148148148148, "acc_norm_stderr": 0.028765111718046944 }, "crows_pairs_english_gender": { "likelihood_difference": 4.05078125, "likelihood_difference_stderr": 0.38152405690444796, "pct_stereotype": 0.5, "pct_stereotype_stderr": 0.027994625547792713 }, "wsc": { "acc": 0.6346153846153846, "acc_stderr": 0.0474473339327792 }, "hendrycksTest-high_school_us_history": { "acc": 0.18627450980392157, "acc_stderr": 0.027325470966716336, "acc_norm": 0.21568627450980393, "acc_norm_stderr": 0.028867431449849313 }, "crows_pairs_english_religion": { "likelihood_difference": 5.009853603603603, "likelihood_difference_stderr": 0.5228133914951523, "pct_stereotype": 0.5855855855855856, "pct_stereotype_stderr": 0.04696953631102271 }, "sciq": { "acc": 0.228, "acc_stderr": 0.013273740700804483, "acc_norm": 0.236, "acc_norm_stderr": 0.013434451402438685 }, "crows_pairs_english_physical_appearance": { "likelihood_difference": 4.72265625, "likelihood_difference_stderr": 0.5793499299137083, "pct_stereotype": 0.5555555555555556, "pct_stereotype_stderr": 0.05897165471491952 }, "hendrycksTest-machine_learning": { "acc": 0.2767857142857143, "acc_stderr": 0.042466243366976256, "acc_norm": 0.25, "acc_norm_stderr": 0.04109974682633932 }, "hendrycksTest-prehistory": { "acc": 0.28703703703703703, "acc_stderr": 0.02517104191530968, "acc_norm": 0.24382716049382716, "acc_norm_stderr": 0.023891879541959593 }, "hendrycksTest-sociology": { "acc": 0.23383084577114427, "acc_stderr": 0.029929415408348384, "acc_norm": 0.24875621890547264, "acc_norm_stderr": 0.03056767593891672 }, "hendrycksTest-global_facts": { "acc": 0.27, "acc_stderr": 0.044619604333847394, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "crows_pairs_french": { "likelihood_difference": 6.74689736135957, "likelihood_difference_stderr": 0.15103608824599826, "pct_stereotype": 0.5533691115086464, "pct_stereotype_stderr": 0.012143526564900555 }, "hendrycksTest-medical_genetics": { "acc": 0.29, "acc_stderr": 0.045604802157206845, "acc_norm": 0.28, "acc_norm_stderr": 0.045126085985421276 }, "hendrycksTest-high_school_mathematics": { "acc": 0.12222222222222222, "acc_stderr": 0.019970605780284603, "acc_norm": 0.1814814814814815, "acc_norm_stderr": 0.023499264669407282 }, "hendrycksTest-college_biology": { "acc": 0.2222222222222222, "acc_stderr": 0.034765901043041336, "acc_norm": 0.20833333333333334, "acc_norm_stderr": 0.03396116205845335 }, "hendrycksTest-conceptual_physics": { "acc": 0.251063829787234, "acc_stderr": 0.02834696377716246, "acc_norm": 0.2, "acc_norm_stderr": 0.0261488180184245 }, "hendrycksTest-moral_scenarios": { "acc": 0.23798882681564246, "acc_stderr": 0.014242630070574915, "acc_norm": 0.27262569832402234, "acc_norm_stderr": 0.014893391735249588 }, "hendrycksTest-jurisprudence": { "acc": 0.1574074074074074, "acc_stderr": 0.03520703990517965, "acc_norm": 0.21296296296296297, "acc_norm_stderr": 0.039578354719809805 }, "crows_pairs_english": { "likelihood_difference": 4.661393112701252, "likelihood_difference_stderr": 0.13998586074905606, "pct_stereotype": 0.456768038163387, "pct_stereotype_stderr": 0.012167560197793078 }, "crows_pairs_french_sexual_orientation": { "likelihood_difference": 13.163461538461538, "likelihood_difference_stderr": 0.8325716351947234, "pct_stereotype": 0.7912087912087912, "pct_stereotype_stderr": 0.042843052065094304 }, "hendrycksTest-management": { "acc": 0.23300970873786409, "acc_stderr": 0.04185832598928315, "acc_norm": 0.2815533980582524, "acc_norm_stderr": 0.04453254836326467 }, "crows_pairs_english_socioeconomic": { "likelihood_difference": 4.904440789473684, "likelihood_difference_stderr": 0.4062917141669697, "pct_stereotype": 0.48947368421052634, "pct_stereotype_stderr": 0.036361587723547695 }, "hendrycksTest-logical_fallacies": { "acc": 0.22085889570552147, "acc_stderr": 0.032591773927421776, "acc_norm": 0.3128834355828221, "acc_norm_stderr": 0.036429145782924055 }, "hendrycksTest-astronomy": { "acc": 0.20394736842105263, "acc_stderr": 0.032790004063100495, "acc_norm": 0.27631578947368424, "acc_norm_stderr": 0.03639057569952925 }, "crows_pairs_english_autre": { "likelihood_difference": 6.349431818181818, "likelihood_difference_stderr": 2.804745680840638, "pct_stereotype": 0.45454545454545453, "pct_stereotype_stderr": 0.15745916432444335 }, "hendrycksTest-high_school_world_history": { "acc": 0.17721518987341772, "acc_stderr": 0.02485636418450322, "acc_norm": 0.25738396624472576, "acc_norm_stderr": 0.028458820991460295 }, "hendrycksTest-professional_medicine": { "acc": 0.25, "acc_stderr": 0.026303648393696036, "acc_norm": 0.25, "acc_norm_stderr": 0.026303648393696036 }, "hendrycksTest-college_computer_science": { "acc": 0.25, "acc_stderr": 0.04351941398892446, "acc_norm": 0.21, "acc_norm_stderr": 0.040936018074033256 }, "lambada_openai": { "ppl": 705314.6370389248, "ppl_stderr": 50610.68705557734, "acc": 0.0, "acc_stderr": 0.0 }, "hendrycksTest-college_medicine": { "acc": 0.23699421965317918, "acc_stderr": 0.03242414757483098, "acc_norm": 0.2658959537572254, "acc_norm_stderr": 0.03368762932259432 }, "arc_easy": { "acc": 0.27441077441077444, "acc_stderr": 0.00915617712224453, "acc_norm": 0.2849326599326599, "acc_norm_stderr": 0.009262170695590658 }, "hendrycksTest-security_studies": { "acc": 0.3306122448979592, "acc_stderr": 0.030116426296540613, "acc_norm": 0.20408163265306123, "acc_norm_stderr": 0.025801283475090506 }, "winogrande": { "acc": 0.4925019731649566, "acc_stderr": 0.014050905521228577 }, "crows_pairs_english_nationality": { "likelihood_difference": 5.4428530092592595, "likelihood_difference_stderr": 0.3840752204417463, "pct_stereotype": 0.3333333333333333, "pct_stereotype_stderr": 0.03214952147802749 }, "arc_challenge": { "acc": 0.19965870307167236, "acc_stderr": 0.011681625756888669, "acc_norm": 0.24146757679180889, "acc_norm_stderr": 0.01250656483973943 }, "hendrycksTest-computer_security": { "acc": 0.22, "acc_stderr": 0.04163331998932268, "acc_norm": 0.27, "acc_norm_stderr": 0.044619604333847394 }, "hendrycksTest-world_religions": { "acc": 0.1695906432748538, "acc_stderr": 0.028782108105401712, "acc_norm": 0.25146198830409355, "acc_norm_stderr": 0.033275044238468436 }, "crows_pairs_french_age": { "likelihood_difference": 4.167361111111111, "likelihood_difference_stderr": 0.49130810000225555, "pct_stereotype": 0.4111111111111111, "pct_stereotype_stderr": 0.052155640611075534 }, "hendrycksTest-elementary_mathematics": { "acc": 0.2275132275132275, "acc_stderr": 0.021591269407823778, "acc_norm": 0.21164021164021163, "acc_norm_stderr": 0.02103733150526289 }, "hendrycksTest-international_law": { "acc": 0.10743801652892562, "acc_stderr": 0.02826881219254063, "acc_norm": 0.2396694214876033, "acc_norm_stderr": 0.03896878985070417 }, "crows_pairs_french_disability": { "likelihood_difference": 10.162878787878787, "likelihood_difference_stderr": 1.04556369991972, "pct_stereotype": 0.3333333333333333, "pct_stereotype_stderr": 0.0584705346204686 }, "hendrycksTest-miscellaneous": { "acc": 0.23627075351213284, "acc_stderr": 0.015190473717037498, "acc_norm": 0.25287356321839083, "acc_norm_stderr": 0.015543377313719681 }, "hendrycksTest-high_school_european_history": { "acc": 0.16363636363636364, "acc_stderr": 0.028887872395487953, "acc_norm": 0.24242424242424243, "acc_norm_stderr": 0.03346409881055953 }, "crows_pairs_french_religion": { "likelihood_difference": 7.765760869565217, "likelihood_difference_stderr": 0.49195584086877725, "pct_stereotype": 0.6869565217391305, "pct_stereotype_stderr": 0.043432470166108225 }, "hendrycksTest-professional_accounting": { "acc": 0.25886524822695034, "acc_stderr": 0.026129572527180848, "acc_norm": 0.2730496453900709, "acc_norm_stderr": 0.02657786094330786 }, "hendrycksTest-high_school_geography": { "acc": 0.18686868686868688, "acc_stderr": 0.027772533334218977, "acc_norm": 0.30303030303030304, "acc_norm_stderr": 0.032742879140268674 }, "hendrycksTest-anatomy": { "acc": 0.2074074074074074, "acc_stderr": 0.03502553170678319, "acc_norm": 0.25925925925925924, "acc_norm_stderr": 0.03785714465066653 }, "hendrycksTest-philosophy": { "acc": 0.2379421221864952, "acc_stderr": 0.02418515064781871, "acc_norm": 0.2990353697749196, "acc_norm_stderr": 0.02600330111788513 }, "crows_pairs_english_race_color": { "likelihood_difference": 4.281742125984252, "likelihood_difference_stderr": 0.21780058915583433, "pct_stereotype": 0.3838582677165354, "pct_stereotype_stderr": 0.021598410071068296 }, "hendrycksTest-high_school_government_and_politics": { "acc": 0.19689119170984457, "acc_stderr": 0.028697873971860674, "acc_norm": 0.2538860103626943, "acc_norm_stderr": 0.03141024780565318 }, "hendrycksTest-high_school_physics": { "acc": 0.2052980132450331, "acc_stderr": 0.03297986648473836, "acc_norm": 0.24503311258278146, "acc_norm_stderr": 0.035118075718047245 }, "crows_pairs_french_socioeconomic": { "likelihood_difference": 7.983976403061225, "likelihood_difference_stderr": 0.545579868210259, "pct_stereotype": 0.34183673469387754, "pct_stereotype_stderr": 0.033967132039868675 }, "hendrycksTest-high_school_macroeconomics": { "acc": 0.19743589743589743, "acc_stderr": 0.02018264696867484, "acc_norm": 0.22564102564102564, "acc_norm_stderr": 0.02119363252514852 }, "hendrycksTest-human_sexuality": { "acc": 0.29770992366412213, "acc_stderr": 0.04010358942462203, "acc_norm": 0.2824427480916031, "acc_norm_stderr": 0.03948406125768361 }, "hendrycksTest-electrical_engineering": { "acc": 0.25517241379310346, "acc_stderr": 0.03632984052707842, "acc_norm": 0.2689655172413793, "acc_norm_stderr": 0.036951833116502325 }, "hendrycksTest-us_foreign_policy": { "acc": 0.23, "acc_stderr": 0.04229525846816505, "acc_norm": 0.24, "acc_norm_stderr": 0.042923469599092816 }, "crows_pairs_french_race_color": { "likelihood_difference": 5.3552989130434785, "likelihood_difference_stderr": 0.2271004698936648, "pct_stereotype": 0.6869565217391305, "pct_stereotype_stderr": 0.021645150653106047 }, "piqa": { "acc": 0.5179542981501633, "acc_stderr": 0.011658300623287153, "acc_norm": 0.515778019586507, "acc_norm_stderr": 0.011660014400426182 }, "hendrycksTest-virology": { "acc": 0.22289156626506024, "acc_stderr": 0.03240004825594688, "acc_norm": 0.25301204819277107, "acc_norm_stderr": 0.03384429155233137 }, "hendrycksTest-college_mathematics": { "acc": 0.16, "acc_stderr": 0.03684529491774708, "acc_norm": 0.17, "acc_norm_stderr": 0.0377525168068637 }, "hendrycksTest-high_school_biology": { "acc": 0.2129032258064516, "acc_stderr": 0.02328766512726853, "acc_norm": 0.23870967741935484, "acc_norm_stderr": 0.024251071262208837 }, "hendrycksTest-professional_law": { "acc": 0.242503259452412, "acc_stderr": 0.010946570966348783, "acc_norm": 0.2711864406779661, "acc_norm_stderr": 0.011354581451622986 }, "hendrycksTest-college_physics": { "acc": 0.20588235294117646, "acc_stderr": 0.04023382273617747, "acc_norm": 0.19607843137254902, "acc_norm_stderr": 0.03950581861179962 }, "hendrycksTest-nutrition": { "acc": 0.19607843137254902, "acc_stderr": 0.022733789405447593, "acc_norm": 0.28431372549019607, "acc_norm_stderr": 0.025829163272757482 }, "hendrycksTest-business_ethics": { "acc": 0.32, "acc_stderr": 0.04688261722621505, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720684 }, "crows_pairs_french_physical_appearance": { "likelihood_difference": 7.185329861111111, "likelihood_difference_stderr": 0.9560662240150144, "pct_stereotype": 0.5416666666666666, "pct_stereotype_stderr": 0.05913268547421809 } }, "versions": { "crows_pairs_french_gender": 0, "hendrycksTest-marketing": 0, "hendrycksTest-high_school_psychology": 0, "hendrycksTest-college_chemistry": 0, "hendrycksTest-abstract_algebra": 0, "hendrycksTest-high_school_chemistry": 0, "hendrycksTest-econometrics": 0, "crows_pairs_english_disability": 0, "hendrycksTest-high_school_computer_science": 0, "crows_pairs_french_nationality": 0, "hendrycksTest-public_relations": 0, "crows_pairs_english_age": 0, "logiqa": 0, "hendrycksTest-human_aging": 0, "hendrycksTest-formal_logic": 0, "hendrycksTest-clinical_knowledge": 0, "crows_pairs_french_autre": 0, "hendrycksTest-moral_disputes": 0, "crows_pairs_english_sexual_orientation": 0, "hendrycksTest-professional_psychology": 0, "hendrycksTest-high_school_microeconomics": 0, "hendrycksTest-high_school_statistics": 0, "crows_pairs_english_gender": 0, "wsc": 0, "hendrycksTest-high_school_us_history": 0, "crows_pairs_english_religion": 0, "sciq": 0, "crows_pairs_english_physical_appearance": 0, "hendrycksTest-machine_learning": 0, "hendrycksTest-prehistory": 0, "hendrycksTest-sociology": 0, "hendrycksTest-global_facts": 0, "crows_pairs_french": 0, "hendrycksTest-medical_genetics": 0, "hendrycksTest-high_school_mathematics": 0, "hendrycksTest-college_biology": 0, "hendrycksTest-conceptual_physics": 0, "hendrycksTest-moral_scenarios": 0, "hendrycksTest-jurisprudence": 0, "crows_pairs_english": 0, "crows_pairs_french_sexual_orientation": 0, "hendrycksTest-management": 0, "crows_pairs_english_socioeconomic": 0, "hendrycksTest-logical_fallacies": 0, "hendrycksTest-astronomy": 0, "crows_pairs_english_autre": 0, "hendrycksTest-high_school_world_history": 0, "hendrycksTest-professional_medicine": 0, "hendrycksTest-college_computer_science": 0, "lambada_openai": 0, "hendrycksTest-college_medicine": 0, "arc_easy": 0, "hendrycksTest-security_studies": 0, "winogrande": 0, "crows_pairs_english_nationality": 0, "arc_challenge": 0, "hendrycksTest-computer_security": 0, "hendrycksTest-world_religions": 0, "crows_pairs_french_age": 0, "hendrycksTest-elementary_mathematics": 0, "hendrycksTest-international_law": 0, "crows_pairs_french_disability": 0, "hendrycksTest-miscellaneous": 0, "hendrycksTest-high_school_european_history": 0, "crows_pairs_french_religion": 0, "hendrycksTest-professional_accounting": 0, "hendrycksTest-high_school_geography": 0, "hendrycksTest-anatomy": 0, "hendrycksTest-philosophy": 0, "crows_pairs_english_race_color": 0, "hendrycksTest-high_school_government_and_politics": 0, "hendrycksTest-high_school_physics": 0, "crows_pairs_french_socioeconomic": 0, "hendrycksTest-high_school_macroeconomics": 0, "hendrycksTest-human_sexuality": 0, "hendrycksTest-electrical_engineering": 0, "hendrycksTest-us_foreign_policy": 0, "crows_pairs_french_race_color": 0, "piqa": 0, "hendrycksTest-virology": 0, "hendrycksTest-college_mathematics": 0, "hendrycksTest-high_school_biology": 0, "hendrycksTest-professional_law": 0, "hendrycksTest-college_physics": 0, "hendrycksTest-nutrition": 0, "hendrycksTest-business_ethics": 0, "crows_pairs_french_physical_appearance": 0 }, "config": { "model": "hf-causal", "model_args": "use_accelerate=True,pretrained=EleutherAI/pythia-v1.1-70m,revision=step256", "num_fewshot": 0, "batch_size": 32, "device": null, "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {} } }