{ "results": { "crows_pairs_english": { "likelihood_difference": 3.4712842874180083, "likelihood_difference_stderr": 0.10356437803282284, "pct_stereotype": 0.528324388789505, "pct_stereotype_stderr": 0.012193686719906043 }, "hendrycksTest-international_law": { "acc": 0.15702479338842976, "acc_stderr": 0.0332124484254713, "acc_norm": 0.3884297520661157, "acc_norm_stderr": 0.04449270350068381 }, "hendrycksTest-sociology": { "acc": 0.2736318407960199, "acc_stderr": 0.03152439186555405, "acc_norm": 0.29850746268656714, "acc_norm_stderr": 0.032357437893550424 }, "wsc": { "acc": 0.36538461538461536, "acc_stderr": 0.0474473339327792 }, "hendrycksTest-econometrics": { "acc": 0.2631578947368421, "acc_stderr": 0.0414243971948936, "acc_norm": 0.2631578947368421, "acc_norm_stderr": 0.0414243971948936 }, "hendrycksTest-electrical_engineering": { "acc": 0.25517241379310346, "acc_stderr": 0.03632984052707842, "acc_norm": 0.2827586206896552, "acc_norm_stderr": 0.03752833958003337 }, "hendrycksTest-business_ethics": { "acc": 0.27, "acc_stderr": 0.044619604333847394, "acc_norm": 0.25, "acc_norm_stderr": 0.04351941398892446 }, "hendrycksTest-virology": { "acc": 0.2710843373493976, "acc_stderr": 0.03460579907553028, "acc_norm": 0.2289156626506024, "acc_norm_stderr": 0.03270745277352477 }, "hendrycksTest-nutrition": { "acc": 0.24836601307189543, "acc_stderr": 0.024739981355113596, "acc_norm": 0.32679738562091504, "acc_norm_stderr": 0.026857294663281423 }, "crows_pairs_french_religion": { "likelihood_difference": 6.3964673913043475, "likelihood_difference_stderr": 0.5088370601767548, "pct_stereotype": 0.45217391304347826, "pct_stereotype_stderr": 0.04661456979958347 }, "crows_pairs_french": { "likelihood_difference": 5.3115170319022065, "likelihood_difference_stderr": 0.13097278073096086, "pct_stereotype": 0.4502087060226595, "pct_stereotype_stderr": 0.012152590574174895 }, "crows_pairs_french_gender": { "likelihood_difference": 4.093165887850467, "likelihood_difference_stderr": 0.22263346692021055, "pct_stereotype": 0.5233644859813084, "pct_stereotype_stderr": 0.027920316348204993 }, "hendrycksTest-security_studies": { "acc": 0.2693877551020408, "acc_stderr": 0.02840125202902294, "acc_norm": 0.22040816326530613, "acc_norm_stderr": 0.026537045312145277 }, "hendrycksTest-college_computer_science": { "acc": 0.22, "acc_stderr": 0.04163331998932269, "acc_norm": 0.19, "acc_norm_stderr": 0.03942772444036623 }, "crows_pairs_english_physical_appearance": { "likelihood_difference": 3.4659288194444446, "likelihood_difference_stderr": 0.3806466296043766, "pct_stereotype": 0.5416666666666666, "pct_stereotype_stderr": 0.05913268547421811 }, "hendrycksTest-professional_medicine": { "acc": 0.26838235294117646, "acc_stderr": 0.02691748122437721, "acc_norm": 0.2647058823529412, "acc_norm_stderr": 0.026799562024887674 }, "hendrycksTest-astronomy": { "acc": 0.23026315789473684, "acc_stderr": 0.03426059424403165, "acc_norm": 0.32894736842105265, "acc_norm_stderr": 0.03823428969926605 }, "hendrycksTest-high_school_geography": { "acc": 0.25252525252525254, "acc_stderr": 0.030954055470365904, "acc_norm": 0.25757575757575757, "acc_norm_stderr": 0.031156269519646836 }, "crows_pairs_french_race_color": { "likelihood_difference": 4.595516304347826, "likelihood_difference_stderr": 0.23328673581474416, "pct_stereotype": 0.44130434782608696, "pct_stereotype_stderr": 0.023176636328300308 }, "hendrycksTest-machine_learning": { "acc": 0.26785714285714285, "acc_stderr": 0.04203277291467762, "acc_norm": 0.25, "acc_norm_stderr": 0.04109974682633932 }, "crows_pairs_english_religion": { "likelihood_difference": 3.661739864864865, "likelihood_difference_stderr": 0.4569099844348636, "pct_stereotype": 0.6036036036036037, "pct_stereotype_stderr": 0.04663848326322448 }, "hendrycksTest-professional_accounting": { "acc": 0.19858156028368795, "acc_stderr": 0.02379830163794214, "acc_norm": 0.20567375886524822, "acc_norm_stderr": 0.024112138950471887 }, "hendrycksTest-medical_genetics": { "acc": 0.25, "acc_stderr": 0.04351941398892446, "acc_norm": 0.34, "acc_norm_stderr": 0.04760952285695235 }, "piqa": { "acc": 0.5794341675734495, "acc_stderr": 0.011517665611282774, "acc_norm": 0.5837867247007617, "acc_norm_stderr": 0.011500864675166568 }, "hendrycksTest-miscellaneous": { "acc": 0.26053639846743293, "acc_stderr": 0.01569600856380708, "acc_norm": 0.25287356321839083, "acc_norm_stderr": 0.015543377313719681 }, "sciq": { "acc": 0.592, "acc_stderr": 0.015549205052920676, "acc_norm": 0.515, "acc_norm_stderr": 0.015812179641814902 }, "hendrycksTest-professional_psychology": { "acc": 0.2434640522875817, "acc_stderr": 0.017362473762146634, "acc_norm": 0.25, "acc_norm_stderr": 0.01751781884501444 }, "crows_pairs_english_nationality": { "likelihood_difference": 3.6025028935185186, "likelihood_difference_stderr": 0.28513005796161467, "pct_stereotype": 0.4166666666666667, "pct_stereotype_stderr": 0.03362277436608043 }, "hendrycksTest-college_mathematics": { "acc": 0.18, "acc_stderr": 0.038612291966536955, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909282 }, "crows_pairs_english_gender": { "likelihood_difference": 2.84423828125, "likelihood_difference_stderr": 0.2759970404950795, "pct_stereotype": 0.58125, "pct_stereotype_stderr": 0.027622536202702143 }, "hendrycksTest-clinical_knowledge": { "acc": 0.2339622641509434, "acc_stderr": 0.02605529690115292, "acc_norm": 0.27169811320754716, "acc_norm_stderr": 0.027377706624670713 }, "crows_pairs_english_disability": { "likelihood_difference": 5.205288461538461, "likelihood_difference_stderr": 0.5700502266857143, "pct_stereotype": 0.5384615384615384, "pct_stereotype_stderr": 0.06231481440776789 }, "hendrycksTest-public_relations": { "acc": 0.3090909090909091, "acc_stderr": 0.044262946482000985, "acc_norm": 0.23636363636363636, "acc_norm_stderr": 0.040693063197213775 }, "arc_challenge": { "acc": 0.17918088737201365, "acc_stderr": 0.011207045216615674, "acc_norm": 0.21245733788395904, "acc_norm_stderr": 0.011953482906582952 }, "crows_pairs_english_age": { "likelihood_difference": 2.5685096153846154, "likelihood_difference_stderr": 0.2910998803105466, "pct_stereotype": 0.4945054945054945, "pct_stereotype_stderr": 0.052701445311128796 }, "hendrycksTest-elementary_mathematics": { "acc": 0.2777777777777778, "acc_stderr": 0.02306818884826111, "acc_norm": 0.2804232804232804, "acc_norm_stderr": 0.023135287974325628 }, "crows_pairs_french_age": { "likelihood_difference": 3.855208333333333, "likelihood_difference_stderr": 0.4788440459459206, "pct_stereotype": 0.45555555555555555, "pct_stereotype_stderr": 0.05279009646630345 }, "crows_pairs_french_disability": { "likelihood_difference": 6.4081439393939394, "likelihood_difference_stderr": 0.5962932736116068, "pct_stereotype": 0.48484848484848486, "pct_stereotype_stderr": 0.06198888629778894 }, "winogrande": { "acc": 0.5082872928176796, "acc_stderr": 0.014050555322824192 }, "hendrycksTest-high_school_computer_science": { "acc": 0.3, "acc_stderr": 0.046056618647183814, "acc_norm": 0.29, "acc_norm_stderr": 0.04560480215720683 }, "hendrycksTest-professional_law": { "acc": 0.23728813559322035, "acc_stderr": 0.010865436690780269, "acc_norm": 0.2646675358539765, "acc_norm_stderr": 0.011267332992845528 }, "hendrycksTest-college_chemistry": { "acc": 0.27, "acc_stderr": 0.0446196043338474, "acc_norm": 0.29, "acc_norm_stderr": 0.045604802157206845 }, "crows_pairs_english_socioeconomic": { "likelihood_difference": 3.9288651315789473, "likelihood_difference_stderr": 0.2802051715338846, "pct_stereotype": 0.6473684210526316, "pct_stereotype_stderr": 0.03475405259582098 }, "hendrycksTest-human_sexuality": { "acc": 0.26717557251908397, "acc_stderr": 0.03880848301082394, "acc_norm": 0.25190839694656486, "acc_norm_stderr": 0.03807387116306086 }, "arc_easy": { "acc": 0.35058922558922556, "acc_stderr": 0.00979100382983156, "acc_norm": 0.3354377104377104, "acc_norm_stderr": 0.009688175165829592 }, "hendrycksTest-college_physics": { "acc": 0.19607843137254902, "acc_stderr": 0.03950581861179963, "acc_norm": 0.28431372549019607, "acc_norm_stderr": 0.04488482852329017 }, "crows_pairs_french_sexual_orientation": { "likelihood_difference": 5.685096153846154, "likelihood_difference_stderr": 0.5726006934973705, "pct_stereotype": 0.7802197802197802, "pct_stereotype_stderr": 0.04364972632898534 }, "hendrycksTest-high_school_government_and_politics": { "acc": 0.18134715025906736, "acc_stderr": 0.02780703236068609, "acc_norm": 0.23834196891191708, "acc_norm_stderr": 0.030748905363909895 }, "hendrycksTest-human_aging": { "acc": 0.3094170403587444, "acc_stderr": 0.03102441174057221, "acc_norm": 0.27802690582959644, "acc_norm_stderr": 0.030069584874494033 }, "hendrycksTest-formal_logic": { "acc": 0.2698412698412698, "acc_stderr": 0.039701582732351706, "acc_norm": 0.21428571428571427, "acc_norm_stderr": 0.03670066451047181 }, "logiqa": { "acc": 0.22887864823348694, "acc_stderr": 0.016478107276313284, "acc_norm": 0.28110599078341014, "acc_norm_stderr": 0.017632374626460005 }, "crows_pairs_french_physical_appearance": { "likelihood_difference": 6.457465277777778, "likelihood_difference_stderr": 0.6653048237221467, "pct_stereotype": 0.5138888888888888, "pct_stereotype_stderr": 0.05931618532716555 }, "hendrycksTest-high_school_microeconomics": { "acc": 0.21428571428571427, "acc_stderr": 0.02665353159671548, "acc_norm": 0.31932773109243695, "acc_norm_stderr": 0.030283995525884396 }, "hendrycksTest-high_school_biology": { "acc": 0.25483870967741934, "acc_stderr": 0.02479011845933221, "acc_norm": 0.2806451612903226, "acc_norm_stderr": 0.025560604721022895 }, "hendrycksTest-high_school_us_history": { "acc": 0.22549019607843138, "acc_stderr": 0.02933116229425172, "acc_norm": 0.25980392156862747, "acc_norm_stderr": 0.03077855467869327 }, "hendrycksTest-high_school_world_history": { "acc": 0.2616033755274262, "acc_stderr": 0.028609516716994934, "acc_norm": 0.28270042194092826, "acc_norm_stderr": 0.029312814153955924 }, "hendrycksTest-high_school_psychology": { "acc": 0.26605504587155965, "acc_stderr": 0.018946022322225604, "acc_norm": 0.26788990825688075, "acc_norm_stderr": 0.018987462257978652 }, "hendrycksTest-conceptual_physics": { "acc": 0.23829787234042554, "acc_stderr": 0.027851252973889778, "acc_norm": 0.2127659574468085, "acc_norm_stderr": 0.026754391348039766 }, "hendrycksTest-high_school_chemistry": { "acc": 0.15270935960591134, "acc_stderr": 0.025308904539380644, "acc_norm": 0.28078817733990147, "acc_norm_stderr": 0.031618563353586086 }, "crows_pairs_french_nationality": { "likelihood_difference": 8.12685276679842, "likelihood_difference_stderr": 0.41139398422096635, "pct_stereotype": 0.233201581027668, "pct_stereotype_stderr": 0.026638273845497513 }, "hendrycksTest-high_school_macroeconomics": { "acc": 0.2230769230769231, "acc_stderr": 0.021107730127243998, "acc_norm": 0.24615384615384617, "acc_norm_stderr": 0.02184086699042308 }, "crows_pairs_english_sexual_orientation": { "likelihood_difference": 4.108198924731183, "likelihood_difference_stderr": 0.5195853940706195, "pct_stereotype": 0.7204301075268817, "pct_stereotype_stderr": 0.046789371667506734 }, "hendrycksTest-high_school_mathematics": { "acc": 0.16296296296296298, "acc_stderr": 0.022518561997682648, "acc_norm": 0.24444444444444444, "acc_norm_stderr": 0.02620276653465215 }, "crows_pairs_french_autre": { "likelihood_difference": 2.8365384615384617, "likelihood_difference_stderr": 0.7093355864720875, "pct_stereotype": 0.3076923076923077, "pct_stereotype_stderr": 0.13323467750529824 }, "hendrycksTest-college_medicine": { "acc": 0.26011560693641617, "acc_stderr": 0.033450369167889904, "acc_norm": 0.2774566473988439, "acc_norm_stderr": 0.034140140070440354 }, "hendrycksTest-management": { "acc": 0.21359223300970873, "acc_stderr": 0.040580420156460344, "acc_norm": 0.2524271844660194, "acc_norm_stderr": 0.04301250399690877 }, "hendrycksTest-us_foreign_policy": { "acc": 0.25, "acc_stderr": 0.04351941398892446, "acc_norm": 0.28, "acc_norm_stderr": 0.04512608598542127 }, "hendrycksTest-world_religions": { "acc": 0.23976608187134502, "acc_stderr": 0.03274485211946956, "acc_norm": 0.30409356725146197, "acc_norm_stderr": 0.03528211258245232 }, "hendrycksTest-moral_scenarios": { "acc": 0.24692737430167597, "acc_stderr": 0.014422292204808835, "acc_norm": 0.25139664804469275, "acc_norm_stderr": 0.014508979453553972 }, "hendrycksTest-global_facts": { "acc": 0.19, "acc_stderr": 0.03942772444036625, "acc_norm": 0.23, "acc_norm_stderr": 0.04229525846816506 }, "hendrycksTest-computer_security": { "acc": 0.22, "acc_stderr": 0.04163331998932269, "acc_norm": 0.3, "acc_norm_stderr": 0.046056618647183814 }, "crows_pairs_english_race_color": { "likelihood_difference": 3.3907787893700787, "likelihood_difference_stderr": 0.16817580159222853, "pct_stereotype": 0.4507874015748031, "pct_stereotype_stderr": 0.02209795835867595 }, "hendrycksTest-high_school_european_history": { "acc": 0.21818181818181817, "acc_stderr": 0.03225078108306289, "acc_norm": 0.2909090909090909, "acc_norm_stderr": 0.03546563019624336 }, "hendrycksTest-marketing": { "acc": 0.2222222222222222, "acc_stderr": 0.027236013946196708, "acc_norm": 0.2606837606837607, "acc_norm_stderr": 0.028760348956523414 }, "hendrycksTest-philosophy": { "acc": 0.19935691318327975, "acc_stderr": 0.022691033780549656, "acc_norm": 0.2829581993569132, "acc_norm_stderr": 0.025583062489984824 }, "hendrycksTest-high_school_physics": { "acc": 0.23178807947019867, "acc_stderr": 0.03445406271987053, "acc_norm": 0.23178807947019867, "acc_norm_stderr": 0.03445406271987054 }, "crows_pairs_english_autre": { "likelihood_difference": 4.900568181818182, "likelihood_difference_stderr": 1.7545892452142433, "pct_stereotype": 0.45454545454545453, "pct_stereotype_stderr": 0.15745916432444335 }, "lambada_openai": { "ppl": 411.658325603736, "ppl_stderr": 17.894759386978997, "acc": 0.12128856976518533, "acc_stderr": 0.004548258586998434 }, "hendrycksTest-college_biology": { "acc": 0.2847222222222222, "acc_stderr": 0.037738099906869334, "acc_norm": 0.2708333333333333, "acc_norm_stderr": 0.037161774375660185 }, "hendrycksTest-logical_fallacies": { "acc": 0.22699386503067484, "acc_stderr": 0.03291099578615769, "acc_norm": 0.26380368098159507, "acc_norm_stderr": 0.03462419931615622 }, "hendrycksTest-abstract_algebra": { "acc": 0.19, "acc_stderr": 0.03942772444036623, "acc_norm": 0.2, "acc_norm_stderr": 0.040201512610368445 }, "hendrycksTest-anatomy": { "acc": 0.28888888888888886, "acc_stderr": 0.03915450630414251, "acc_norm": 0.25925925925925924, "acc_norm_stderr": 0.03785714465066655 }, "hendrycksTest-moral_disputes": { "acc": 0.21098265895953758, "acc_stderr": 0.021966309947043135, "acc_norm": 0.3063583815028902, "acc_norm_stderr": 0.02481835012943659 }, "hendrycksTest-jurisprudence": { "acc": 0.23148148148148148, "acc_stderr": 0.04077494709252626, "acc_norm": 0.35185185185185186, "acc_norm_stderr": 0.04616631111801713 }, "hendrycksTest-high_school_statistics": { "acc": 0.19907407407407407, "acc_stderr": 0.027232298462690242, "acc_norm": 0.2361111111111111, "acc_norm_stderr": 0.028963702570791037 }, "hendrycksTest-prehistory": { "acc": 0.26851851851851855, "acc_stderr": 0.024659685185967273, "acc_norm": 0.22839506172839505, "acc_norm_stderr": 0.023358211840626267 }, "crows_pairs_french_socioeconomic": { "likelihood_difference": 4.585817920918367, "likelihood_difference_stderr": 0.3843067957203524, "pct_stereotype": 0.4489795918367347, "pct_stereotype_stderr": 0.03561884533975954 } }, "versions": { "crows_pairs_english": 0, "hendrycksTest-international_law": 0, "hendrycksTest-sociology": 0, "wsc": 0, "hendrycksTest-econometrics": 0, "hendrycksTest-electrical_engineering": 0, "hendrycksTest-business_ethics": 0, "hendrycksTest-virology": 0, "hendrycksTest-nutrition": 0, "crows_pairs_french_religion": 0, "crows_pairs_french": 0, "crows_pairs_french_gender": 0, "hendrycksTest-security_studies": 0, "hendrycksTest-college_computer_science": 0, "crows_pairs_english_physical_appearance": 0, "hendrycksTest-professional_medicine": 0, "hendrycksTest-astronomy": 0, "hendrycksTest-high_school_geography": 0, "crows_pairs_french_race_color": 0, "hendrycksTest-machine_learning": 0, "crows_pairs_english_religion": 0, "hendrycksTest-professional_accounting": 0, "hendrycksTest-medical_genetics": 0, "piqa": 0, "hendrycksTest-miscellaneous": 0, "sciq": 0, "hendrycksTest-professional_psychology": 0, "crows_pairs_english_nationality": 0, "hendrycksTest-college_mathematics": 0, "crows_pairs_english_gender": 0, "hendrycksTest-clinical_knowledge": 0, "crows_pairs_english_disability": 0, "hendrycksTest-public_relations": 0, "arc_challenge": 0, "crows_pairs_english_age": 0, "hendrycksTest-elementary_mathematics": 0, "crows_pairs_french_age": 0, "crows_pairs_french_disability": 0, "winogrande": 0, "hendrycksTest-high_school_computer_science": 0, "hendrycksTest-professional_law": 0, "hendrycksTest-college_chemistry": 0, "crows_pairs_english_socioeconomic": 0, "hendrycksTest-human_sexuality": 0, "arc_easy": 0, "hendrycksTest-college_physics": 0, "crows_pairs_french_sexual_orientation": 0, "hendrycksTest-high_school_government_and_politics": 0, "hendrycksTest-human_aging": 0, "hendrycksTest-formal_logic": 0, "logiqa": 0, "crows_pairs_french_physical_appearance": 0, "hendrycksTest-high_school_microeconomics": 0, "hendrycksTest-high_school_biology": 0, "hendrycksTest-high_school_us_history": 0, "hendrycksTest-high_school_world_history": 0, "hendrycksTest-high_school_psychology": 0, "hendrycksTest-conceptual_physics": 0, "hendrycksTest-high_school_chemistry": 0, "crows_pairs_french_nationality": 0, "hendrycksTest-high_school_macroeconomics": 0, "crows_pairs_english_sexual_orientation": 0, "hendrycksTest-high_school_mathematics": 0, "crows_pairs_french_autre": 0, "hendrycksTest-college_medicine": 0, "hendrycksTest-management": 0, "hendrycksTest-us_foreign_policy": 0, "hendrycksTest-world_religions": 0, "hendrycksTest-moral_scenarios": 0, "hendrycksTest-global_facts": 0, "hendrycksTest-computer_security": 0, "crows_pairs_english_race_color": 0, "hendrycksTest-high_school_european_history": 0, "hendrycksTest-marketing": 0, "hendrycksTest-philosophy": 0, "hendrycksTest-high_school_physics": 0, "crows_pairs_english_autre": 0, "lambada_openai": 0, "hendrycksTest-college_biology": 0, "hendrycksTest-logical_fallacies": 0, "hendrycksTest-abstract_algebra": 0, "hendrycksTest-anatomy": 0, "hendrycksTest-moral_disputes": 0, "hendrycksTest-jurisprudence": 0, "hendrycksTest-high_school_statistics": 0, "hendrycksTest-prehistory": 0, "crows_pairs_french_socioeconomic": 0 }, "config": { "model": "hf-causal", "model_args": "pretrained=EleutherAI/pythia-v1.1-70m,revision=step3000", "num_fewshot": 0, "batch_size": 16, "device": "cuda:0", "no_cache": true, "limit": null, "bootstrap_iters": 100000, "description_dict": {} } }