{ "results": { "crows_pairs_english_age": { "likelihood_difference": 2.6893887362637363, "likelihood_difference_stderr": 0.2680170470214308, "pct_stereotype": 0.5494505494505495, "pct_stereotype_stderr": 0.05244623100101224 }, "hendrycksTest-college_mathematics": { "acc": 0.2, "acc_stderr": 0.040201512610368445, "acc_norm": 0.28, "acc_norm_stderr": 0.045126085985421276 }, "hendrycksTest-college_medicine": { "acc": 0.26011560693641617, "acc_stderr": 0.03345036916788991, "acc_norm": 0.28901734104046245, "acc_norm_stderr": 0.034564257450869995 }, "crows_pairs_english_autre": { "likelihood_difference": 4.231534090909091, "likelihood_difference_stderr": 1.4093980202807441, "pct_stereotype": 0.45454545454545453, "pct_stereotype_stderr": 0.15745916432444335 }, "hendrycksTest-college_physics": { "acc": 0.19607843137254902, "acc_stderr": 0.03950581861179964, "acc_norm": 0.22549019607843138, "acc_norm_stderr": 0.04158307533083286 }, "hendrycksTest-prehistory": { "acc": 0.25617283950617287, "acc_stderr": 0.0242885336377261, "acc_norm": 0.2345679012345679, "acc_norm_stderr": 0.023576881744005716 }, "hendrycksTest-college_computer_science": { "acc": 0.23, "acc_stderr": 0.04229525846816506, "acc_norm": 0.23, "acc_norm_stderr": 0.04229525846816505 }, "hendrycksTest-professional_psychology": { "acc": 0.25326797385620914, "acc_stderr": 0.01759348689536683, "acc_norm": 0.2581699346405229, "acc_norm_stderr": 0.01770453165325007 }, "hendrycksTest-management": { "acc": 0.1941747572815534, "acc_stderr": 0.03916667762822584, "acc_norm": 0.32038834951456313, "acc_norm_stderr": 0.0462028408228004 }, "crows_pairs_french_sexual_orientation": { "likelihood_difference": 6.262362637362638, "likelihood_difference_stderr": 0.4610653877296226, "pct_stereotype": 0.8021978021978022, "pct_stereotype_stderr": 0.04198895203196222 }, "crows_pairs_english_race_color": { "likelihood_difference": 3.503783218503937, "likelihood_difference_stderr": 0.1827001921291112, "pct_stereotype": 0.47834645669291337, "pct_stereotype_stderr": 0.022184946299954114 }, "hendrycksTest-philosophy": { "acc": 0.21221864951768488, "acc_stderr": 0.023222756797435122, "acc_norm": 0.26688102893890675, "acc_norm_stderr": 0.025122637608816632 }, "logiqa": { "acc": 0.21658986175115208, "acc_stderr": 0.0161568605831783, "acc_norm": 0.29185867895545314, "acc_norm_stderr": 0.01783157055397193 }, "crows_pairs_french_autre": { "likelihood_difference": 4.449519230769231, "likelihood_difference_stderr": 0.9182419616034272, "pct_stereotype": 0.3076923076923077, "pct_stereotype_stderr": 0.13323467750529824 }, "crows_pairs_english_disability": { "likelihood_difference": 5.5115384615384615, "likelihood_difference_stderr": 0.5769147322613659, "pct_stereotype": 0.6461538461538462, "pct_stereotype_stderr": 0.05977027026123099 }, "arc_challenge": { "acc": 0.17747440273037543, "acc_stderr": 0.011165138769643972, "acc_norm": 0.23037542662116042, "acc_norm_stderr": 0.01230492841874761 }, "hendrycksTest-high_school_european_history": { "acc": 0.23636363636363636, "acc_stderr": 0.033175059300091805, "acc_norm": 0.24848484848484848, "acc_norm_stderr": 0.033744026441394036 }, "hendrycksTest-international_law": { "acc": 0.17355371900826447, "acc_stderr": 0.03457272836917671, "acc_norm": 0.371900826446281, "acc_norm_stderr": 0.04412015806624504 }, "hendrycksTest-formal_logic": { "acc": 0.30158730158730157, "acc_stderr": 0.04104947269903394, "acc_norm": 0.2857142857142857, "acc_norm_stderr": 0.0404061017820884 }, "hendrycksTest-miscellaneous": { "acc": 0.2656449553001277, "acc_stderr": 0.015794302487888715, "acc_norm": 0.24521072796934865, "acc_norm_stderr": 0.015384352284543944 }, "hendrycksTest-high_school_government_and_politics": { "acc": 0.2694300518134715, "acc_stderr": 0.03201867122877794, "acc_norm": 0.26424870466321243, "acc_norm_stderr": 0.03182155050916647 }, "hendrycksTest-jurisprudence": { "acc": 0.18518518518518517, "acc_stderr": 0.03755265865037181, "acc_norm": 0.37037037037037035, "acc_norm_stderr": 0.04668408033024931 }, "hendrycksTest-high_school_statistics": { "acc": 0.19907407407407407, "acc_stderr": 0.02723229846269024, "acc_norm": 0.25925925925925924, "acc_norm_stderr": 0.02988691054762698 }, "hendrycksTest-human_sexuality": { "acc": 0.26717557251908397, "acc_stderr": 0.03880848301082395, "acc_norm": 0.25190839694656486, "acc_norm_stderr": 0.03807387116306086 }, "crows_pairs_english_physical_appearance": { "likelihood_difference": 3.7981770833333335, "likelihood_difference_stderr": 0.4326219499395864, "pct_stereotype": 0.625, "pct_stereotype_stderr": 0.05745481997211521 }, "crows_pairs_english": { "likelihood_difference": 3.580799045915325, "likelihood_difference_stderr": 0.10390408640595741, "pct_stereotype": 0.5468097793679189, "pct_stereotype_stderr": 0.012159658951661536 }, "hendrycksTest-high_school_computer_science": { "acc": 0.22, "acc_stderr": 0.0416333199893227, "acc_norm": 0.35, "acc_norm_stderr": 0.04793724854411021 }, "hendrycksTest-clinical_knowledge": { "acc": 0.21509433962264152, "acc_stderr": 0.025288394502891363, "acc_norm": 0.2981132075471698, "acc_norm_stderr": 0.02815283794249386 }, "arc_easy": { "acc": 0.36574074074074076, "acc_stderr": 0.009882988069418829, "acc_norm": 0.34385521885521886, "acc_norm_stderr": 0.009746660584852448 }, "hendrycksTest-moral_disputes": { "acc": 0.2658959537572254, "acc_stderr": 0.023786203255508277, "acc_norm": 0.30057803468208094, "acc_norm_stderr": 0.024685316867257796 }, "hendrycksTest-college_biology": { "acc": 0.2708333333333333, "acc_stderr": 0.03716177437566014, "acc_norm": 0.2569444444444444, "acc_norm_stderr": 0.03653946969442099 }, "hendrycksTest-high_school_world_history": { "acc": 0.2616033755274262, "acc_stderr": 0.028609516716994934, "acc_norm": 0.270042194092827, "acc_norm_stderr": 0.028900721906293426 }, "hendrycksTest-nutrition": { "acc": 0.26143790849673204, "acc_stderr": 0.025160998214292456, "acc_norm": 0.3431372549019608, "acc_norm_stderr": 0.027184498909941616 }, "hendrycksTest-us_foreign_policy": { "acc": 0.21, "acc_stderr": 0.040936018074033256, "acc_norm": 0.32, "acc_norm_stderr": 0.046882617226215034 }, "hendrycksTest-abstract_algebra": { "acc": 0.23, "acc_stderr": 0.04229525846816508, "acc_norm": 0.28, "acc_norm_stderr": 0.04512608598542127 }, "hendrycksTest-econometrics": { "acc": 0.2543859649122807, "acc_stderr": 0.04096985139843671, "acc_norm": 0.24561403508771928, "acc_norm_stderr": 0.04049339297748141 }, "crows_pairs_french_age": { "likelihood_difference": 4.4215277777777775, "likelihood_difference_stderr": 0.4586681734751123, "pct_stereotype": 0.5, "pct_stereotype_stderr": 0.052999894000318 }, "hendrycksTest-high_school_mathematics": { "acc": 0.1814814814814815, "acc_stderr": 0.0234992646694073, "acc_norm": 0.23703703703703705, "acc_norm_stderr": 0.0259288761327661 }, "crows_pairs_french_religion": { "likelihood_difference": 4.770652173913043, "likelihood_difference_stderr": 0.5382010210337033, "pct_stereotype": 0.4782608695652174, "pct_stereotype_stderr": 0.04678500755208439 }, "hendrycksTest-electrical_engineering": { "acc": 0.2827586206896552, "acc_stderr": 0.03752833958003337, "acc_norm": 0.30344827586206896, "acc_norm_stderr": 0.038312260488503336 }, "crows_pairs_french_race_color": { "likelihood_difference": 4.583423913043478, "likelihood_difference_stderr": 0.25993020575528686, "pct_stereotype": 0.3869565217391304, "pct_stereotype_stderr": 0.02273371341289454 }, "crows_pairs_french_socioeconomic": { "likelihood_difference": 5.456672512755102, "likelihood_difference_stderr": 0.4238880074973118, "pct_stereotype": 0.5153061224489796, "pct_stereotype_stderr": 0.03578896281770489 }, "hendrycksTest-security_studies": { "acc": 0.27755102040816326, "acc_stderr": 0.02866685779027465, "acc_norm": 0.24897959183673468, "acc_norm_stderr": 0.027682979522960234 }, "wsc": { "acc": 0.375, "acc_stderr": 0.04770204856076104 }, "hendrycksTest-high_school_psychology": { "acc": 0.26422018348623855, "acc_stderr": 0.018904164171510186, "acc_norm": 0.25137614678899084, "acc_norm_stderr": 0.018599206360287415 }, "crows_pairs_french_gender": { "likelihood_difference": 4.443341121495327, "likelihood_difference_stderr": 0.22479210100562896, "pct_stereotype": 0.5327102803738317, "pct_stereotype_stderr": 0.027890972865217977 }, "lambada_openai": { "ppl": 133.11058088169239, "ppl_stderr": 5.629178590421373, "acc": 0.20415292062875995, "acc_stderr": 0.005615710162255026 }, "winogrande": { "acc": 0.5027624309392266, "acc_stderr": 0.014052271211616436 }, "hendrycksTest-marketing": { "acc": 0.2692307692307692, "acc_stderr": 0.029058588303748845, "acc_norm": 0.32051282051282054, "acc_norm_stderr": 0.030572811310299607 }, "hendrycksTest-public_relations": { "acc": 0.3090909090909091, "acc_stderr": 0.044262946482000985, "acc_norm": 0.2, "acc_norm_stderr": 0.03831305140884601 }, "hendrycksTest-high_school_microeconomics": { "acc": 0.23529411764705882, "acc_stderr": 0.027553614467863818, "acc_norm": 0.3403361344537815, "acc_norm_stderr": 0.030778057422931673 }, "hendrycksTest-astronomy": { "acc": 0.21710526315789475, "acc_stderr": 0.033550453048829226, "acc_norm": 0.3815789473684211, "acc_norm_stderr": 0.03953173377749194 }, "crows_pairs_french_physical_appearance": { "likelihood_difference": 5.526475694444445, "likelihood_difference_stderr": 0.6334563568346864, "pct_stereotype": 0.5, "pct_stereotype_stderr": 0.05933908290969268 }, "hendrycksTest-high_school_chemistry": { "acc": 0.15270935960591134, "acc_stderr": 0.02530890453938063, "acc_norm": 0.26108374384236455, "acc_norm_stderr": 0.030903796952114468 }, "hendrycksTest-logical_fallacies": { "acc": 0.20245398773006135, "acc_stderr": 0.03157065078911901, "acc_norm": 0.2883435582822086, "acc_norm_stderr": 0.035590395316173425 }, "hendrycksTest-high_school_biology": { "acc": 0.24838709677419354, "acc_stderr": 0.02458002892148101, "acc_norm": 0.3225806451612903, "acc_norm_stderr": 0.02659308451657228 }, "hendrycksTest-high_school_physics": { "acc": 0.2119205298013245, "acc_stderr": 0.03336767086567978, "acc_norm": 0.2251655629139073, "acc_norm_stderr": 0.03410435282008936 }, "piqa": { "acc": 0.5859630032644179, "acc_stderr": 0.011492118481417575, "acc_norm": 0.5875952121871599, "acc_norm_stderr": 0.011485407152743137 }, "hendrycksTest-college_chemistry": { "acc": 0.33, "acc_stderr": 0.04725815626252604, "acc_norm": 0.32, "acc_norm_stderr": 0.04688261722621503 }, "hendrycksTest-human_aging": { "acc": 0.2645739910313901, "acc_stderr": 0.029605103217038332, "acc_norm": 0.2556053811659193, "acc_norm_stderr": 0.029275891003969927 }, "crows_pairs_english_religion": { "likelihood_difference": 3.5471565315315314, "likelihood_difference_stderr": 0.4012612135638327, "pct_stereotype": 0.6396396396396397, "pct_stereotype_stderr": 0.045776211670703136 }, "hendrycksTest-professional_law": { "acc": 0.23663624511082137, "acc_stderr": 0.010855137351572737, "acc_norm": 0.26140808344198174, "acc_norm_stderr": 0.011222528169771309 }, "crows_pairs_english_sexual_orientation": { "likelihood_difference": 4.406754032258065, "likelihood_difference_stderr": 0.5393566741888578, "pct_stereotype": 0.7741935483870968, "pct_stereotype_stderr": 0.043591220947882314 }, "crows_pairs_english_gender": { "likelihood_difference": 3.047314453125, "likelihood_difference_stderr": 0.28201771347734694, "pct_stereotype": 0.546875, "pct_stereotype_stderr": 0.027871330781745147 }, "crows_pairs_french_nationality": { "likelihood_difference": 7.70998023715415, "likelihood_difference_stderr": 0.4337314450386741, "pct_stereotype": 0.3201581027667984, "pct_stereotype_stderr": 0.029389076633931355 }, "hendrycksTest-elementary_mathematics": { "acc": 0.21957671957671956, "acc_stderr": 0.021320018599770355, "acc_norm": 0.2222222222222222, "acc_norm_stderr": 0.021411684393694196 }, "sciq": { "acc": 0.614, "acc_stderr": 0.01540263747678437, "acc_norm": 0.559, "acc_norm_stderr": 0.015708779894242676 }, "hendrycksTest-professional_accounting": { "acc": 0.24822695035460993, "acc_stderr": 0.025770015644290382, "acc_norm": 0.25886524822695034, "acc_norm_stderr": 0.026129572527180848 }, "hendrycksTest-high_school_geography": { "acc": 0.25757575757575757, "acc_stderr": 0.031156269519646847, "acc_norm": 0.31313131313131315, "acc_norm_stderr": 0.03304205087813653 }, "hendrycksTest-sociology": { "acc": 0.2835820895522388, "acc_stderr": 0.03187187537919796, "acc_norm": 0.24875621890547264, "acc_norm_stderr": 0.030567675938916714 }, "hendrycksTest-virology": { "acc": 0.26506024096385544, "acc_stderr": 0.03436024037944967, "acc_norm": 0.25301204819277107, "acc_norm_stderr": 0.03384429155233137 }, "hendrycksTest-business_ethics": { "acc": 0.33, "acc_stderr": 0.04725815626252604, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695235 }, "hendrycksTest-moral_scenarios": { "acc": 0.24134078212290502, "acc_stderr": 0.014310999547961443, "acc_norm": 0.24916201117318434, "acc_norm_stderr": 0.01446589382985993 }, "hendrycksTest-high_school_us_history": { "acc": 0.24509803921568626, "acc_stderr": 0.03019028245350194, "acc_norm": 0.2696078431372549, "acc_norm_stderr": 0.031145570659486782 }, "hendrycksTest-machine_learning": { "acc": 0.24107142857142858, "acc_stderr": 0.040598672469526864, "acc_norm": 0.23214285714285715, "acc_norm_stderr": 0.04007341809755807 }, "hendrycksTest-global_facts": { "acc": 0.24, "acc_stderr": 0.042923469599092816, "acc_norm": 0.24, "acc_norm_stderr": 0.042923469599092816 }, "hendrycksTest-computer_security": { "acc": 0.22, "acc_stderr": 0.041633319989322695, "acc_norm": 0.29, "acc_norm_stderr": 0.045604802157206845 }, "crows_pairs_french": { "likelihood_difference": 5.343344700357782, "likelihood_difference_stderr": 0.1361582396406217, "pct_stereotype": 0.46153846153846156, "pct_stereotype_stderr": 0.012177111585868348 }, "crows_pairs_french_disability": { "likelihood_difference": 6.572443181818182, "likelihood_difference_stderr": 0.7053414331507932, "pct_stereotype": 0.45454545454545453, "pct_stereotype_stderr": 0.06176056549879611 }, "crows_pairs_english_nationality": { "likelihood_difference": 3.519458912037037, "likelihood_difference_stderr": 0.26480319253010903, "pct_stereotype": 0.4305555555555556, "pct_stereotype_stderr": 0.03376922151252336 }, "hendrycksTest-world_religions": { "acc": 0.23391812865497075, "acc_stderr": 0.03246721765117825, "acc_norm": 0.26900584795321636, "acc_norm_stderr": 0.03401052620104089 }, "crows_pairs_english_socioeconomic": { "likelihood_difference": 4.016694078947369, "likelihood_difference_stderr": 0.25038749794237203, "pct_stereotype": 0.6368421052631579, "pct_stereotype_stderr": 0.03498104083833203 }, "hendrycksTest-professional_medicine": { "acc": 0.23161764705882354, "acc_stderr": 0.025626533803777562, "acc_norm": 0.2647058823529412, "acc_norm_stderr": 0.026799562024887674 }, "hendrycksTest-anatomy": { "acc": 0.2740740740740741, "acc_stderr": 0.03853254836552003, "acc_norm": 0.25925925925925924, "acc_norm_stderr": 0.037857144650666544 }, "hendrycksTest-high_school_macroeconomics": { "acc": 0.24871794871794872, "acc_stderr": 0.0219169577092138, "acc_norm": 0.26666666666666666, "acc_norm_stderr": 0.022421273612923714 }, "hendrycksTest-medical_genetics": { "acc": 0.22, "acc_stderr": 0.04163331998932269, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695235 }, "hendrycksTest-conceptual_physics": { "acc": 0.2723404255319149, "acc_stderr": 0.0291012906983867, "acc_norm": 0.20425531914893616, "acc_norm_stderr": 0.02635515841334942 } }, "versions": { "crows_pairs_english_age": 0, "hendrycksTest-college_mathematics": 0, "hendrycksTest-college_medicine": 0, "crows_pairs_english_autre": 0, "hendrycksTest-college_physics": 0, "hendrycksTest-prehistory": 0, "hendrycksTest-college_computer_science": 0, "hendrycksTest-professional_psychology": 0, "hendrycksTest-management": 0, "crows_pairs_french_sexual_orientation": 0, "crows_pairs_english_race_color": 0, "hendrycksTest-philosophy": 0, "logiqa": 0, "crows_pairs_french_autre": 0, "crows_pairs_english_disability": 0, "arc_challenge": 0, "hendrycksTest-high_school_european_history": 0, "hendrycksTest-international_law": 0, "hendrycksTest-formal_logic": 0, "hendrycksTest-miscellaneous": 0, "hendrycksTest-high_school_government_and_politics": 0, "hendrycksTest-jurisprudence": 0, "hendrycksTest-high_school_statistics": 0, "hendrycksTest-human_sexuality": 0, "crows_pairs_english_physical_appearance": 0, "crows_pairs_english": 0, "hendrycksTest-high_school_computer_science": 0, "hendrycksTest-clinical_knowledge": 0, "arc_easy": 0, "hendrycksTest-moral_disputes": 0, "hendrycksTest-college_biology": 0, "hendrycksTest-high_school_world_history": 0, "hendrycksTest-nutrition": 0, "hendrycksTest-us_foreign_policy": 0, "hendrycksTest-abstract_algebra": 0, "hendrycksTest-econometrics": 0, "crows_pairs_french_age": 0, "hendrycksTest-high_school_mathematics": 0, "crows_pairs_french_religion": 0, "hendrycksTest-electrical_engineering": 0, "crows_pairs_french_race_color": 0, "crows_pairs_french_socioeconomic": 0, "hendrycksTest-security_studies": 0, "wsc": 0, "hendrycksTest-high_school_psychology": 0, "crows_pairs_french_gender": 0, "lambada_openai": 0, "winogrande": 0, "hendrycksTest-marketing": 0, "hendrycksTest-public_relations": 0, "hendrycksTest-high_school_microeconomics": 0, "hendrycksTest-astronomy": 0, "crows_pairs_french_physical_appearance": 0, "hendrycksTest-high_school_chemistry": 0, "hendrycksTest-logical_fallacies": 0, "hendrycksTest-high_school_biology": 0, "hendrycksTest-high_school_physics": 0, "piqa": 0, "hendrycksTest-college_chemistry": 0, "hendrycksTest-human_aging": 0, "crows_pairs_english_religion": 0, "hendrycksTest-professional_law": 0, "crows_pairs_english_sexual_orientation": 0, "crows_pairs_english_gender": 0, "crows_pairs_french_nationality": 0, "hendrycksTest-elementary_mathematics": 0, "sciq": 0, "hendrycksTest-professional_accounting": 0, "hendrycksTest-high_school_geography": 0, "hendrycksTest-sociology": 0, "hendrycksTest-virology": 0, "hendrycksTest-business_ethics": 0, "hendrycksTest-moral_scenarios": 0, "hendrycksTest-high_school_us_history": 0, "hendrycksTest-machine_learning": 0, "hendrycksTest-global_facts": 0, "hendrycksTest-computer_security": 0, "crows_pairs_french": 0, "crows_pairs_french_disability": 0, "crows_pairs_english_nationality": 0, "hendrycksTest-world_religions": 0, "crows_pairs_english_socioeconomic": 0, "hendrycksTest-professional_medicine": 0, "hendrycksTest-anatomy": 0, "hendrycksTest-high_school_macroeconomics": 0, "hendrycksTest-medical_genetics": 0, "hendrycksTest-conceptual_physics": 0 }, "config": { "model": "hf-causal", "model_args": "pretrained=EleutherAI/pythia-v1.1-70m,revision=step123000", "num_fewshot": 0, "batch_size": 16, "device": "cuda:4", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {} } }