Spaces:
Running
Running
| { | |
| "results": { | |
| "crows_pairs_english": { | |
| "likelihood_difference": 3.4712842874180083, | |
| "likelihood_difference_stderr": 0.10356437803282284, | |
| "pct_stereotype": 0.528324388789505, | |
| "pct_stereotype_stderr": 0.012193686719906043 | |
| }, | |
| "hendrycksTest-international_law": { | |
| "acc": 0.15702479338842976, | |
| "acc_stderr": 0.0332124484254713, | |
| "acc_norm": 0.3884297520661157, | |
| "acc_norm_stderr": 0.04449270350068381 | |
| }, | |
| "hendrycksTest-sociology": { | |
| "acc": 0.2736318407960199, | |
| "acc_stderr": 0.03152439186555405, | |
| "acc_norm": 0.29850746268656714, | |
| "acc_norm_stderr": 0.032357437893550424 | |
| }, | |
| "wsc": { | |
| "acc": 0.36538461538461536, | |
| "acc_stderr": 0.0474473339327792 | |
| }, | |
| "hendrycksTest-econometrics": { | |
| "acc": 0.2631578947368421, | |
| "acc_stderr": 0.0414243971948936, | |
| "acc_norm": 0.2631578947368421, | |
| "acc_norm_stderr": 0.0414243971948936 | |
| }, | |
| "hendrycksTest-electrical_engineering": { | |
| "acc": 0.25517241379310346, | |
| "acc_stderr": 0.03632984052707842, | |
| "acc_norm": 0.2827586206896552, | |
| "acc_norm_stderr": 0.03752833958003337 | |
| }, | |
| "hendrycksTest-business_ethics": { | |
| "acc": 0.27, | |
| "acc_stderr": 0.044619604333847394, | |
| "acc_norm": 0.25, | |
| "acc_norm_stderr": 0.04351941398892446 | |
| }, | |
| "hendrycksTest-virology": { | |
| "acc": 0.2710843373493976, | |
| "acc_stderr": 0.03460579907553028, | |
| "acc_norm": 0.2289156626506024, | |
| "acc_norm_stderr": 0.03270745277352477 | |
| }, | |
| "hendrycksTest-nutrition": { | |
| "acc": 0.24836601307189543, | |
| "acc_stderr": 0.024739981355113596, | |
| "acc_norm": 0.32679738562091504, | |
| "acc_norm_stderr": 0.026857294663281423 | |
| }, | |
| "crows_pairs_french_religion": { | |
| "likelihood_difference": 6.3964673913043475, | |
| "likelihood_difference_stderr": 0.5088370601767548, | |
| "pct_stereotype": 0.45217391304347826, | |
| "pct_stereotype_stderr": 0.04661456979958347 | |
| }, | |
| "crows_pairs_french": { | |
| "likelihood_difference": 5.3115170319022065, | |
| "likelihood_difference_stderr": 0.13097278073096086, | |
| "pct_stereotype": 0.4502087060226595, | |
| "pct_stereotype_stderr": 0.012152590574174895 | |
| }, | |
| "crows_pairs_french_gender": { | |
| "likelihood_difference": 4.093165887850467, | |
| "likelihood_difference_stderr": 0.22263346692021055, | |
| "pct_stereotype": 0.5233644859813084, | |
| "pct_stereotype_stderr": 0.027920316348204993 | |
| }, | |
| "hendrycksTest-security_studies": { | |
| "acc": 0.2693877551020408, | |
| "acc_stderr": 0.02840125202902294, | |
| "acc_norm": 0.22040816326530613, | |
| "acc_norm_stderr": 0.026537045312145277 | |
| }, | |
| "hendrycksTest-college_computer_science": { | |
| "acc": 0.22, | |
| "acc_stderr": 0.04163331998932269, | |
| "acc_norm": 0.19, | |
| "acc_norm_stderr": 0.03942772444036623 | |
| }, | |
| "crows_pairs_english_physical_appearance": { | |
| "likelihood_difference": 3.4659288194444446, | |
| "likelihood_difference_stderr": 0.3806466296043766, | |
| "pct_stereotype": 0.5416666666666666, | |
| "pct_stereotype_stderr": 0.05913268547421811 | |
| }, | |
| "hendrycksTest-professional_medicine": { | |
| "acc": 0.26838235294117646, | |
| "acc_stderr": 0.02691748122437721, | |
| "acc_norm": 0.2647058823529412, | |
| "acc_norm_stderr": 0.026799562024887674 | |
| }, | |
| "hendrycksTest-astronomy": { | |
| "acc": 0.23026315789473684, | |
| "acc_stderr": 0.03426059424403165, | |
| "acc_norm": 0.32894736842105265, | |
| "acc_norm_stderr": 0.03823428969926605 | |
| }, | |
| "hendrycksTest-high_school_geography": { | |
| "acc": 0.25252525252525254, | |
| "acc_stderr": 0.030954055470365904, | |
| "acc_norm": 0.25757575757575757, | |
| "acc_norm_stderr": 0.031156269519646836 | |
| }, | |
| "crows_pairs_french_race_color": { | |
| "likelihood_difference": 4.595516304347826, | |
| "likelihood_difference_stderr": 0.23328673581474416, | |
| "pct_stereotype": 0.44130434782608696, | |
| "pct_stereotype_stderr": 0.023176636328300308 | |
| }, | |
| "hendrycksTest-machine_learning": { | |
| "acc": 0.26785714285714285, | |
| "acc_stderr": 0.04203277291467762, | |
| "acc_norm": 0.25, | |
| "acc_norm_stderr": 0.04109974682633932 | |
| }, | |
| "crows_pairs_english_religion": { | |
| "likelihood_difference": 3.661739864864865, | |
| "likelihood_difference_stderr": 0.4569099844348636, | |
| "pct_stereotype": 0.6036036036036037, | |
| "pct_stereotype_stderr": 0.04663848326322448 | |
| }, | |
| "hendrycksTest-professional_accounting": { | |
| "acc": 0.19858156028368795, | |
| "acc_stderr": 0.02379830163794214, | |
| "acc_norm": 0.20567375886524822, | |
| "acc_norm_stderr": 0.024112138950471887 | |
| }, | |
| "hendrycksTest-medical_genetics": { | |
| "acc": 0.25, | |
| "acc_stderr": 0.04351941398892446, | |
| "acc_norm": 0.34, | |
| "acc_norm_stderr": 0.04760952285695235 | |
| }, | |
| "piqa": { | |
| "acc": 0.5794341675734495, | |
| "acc_stderr": 0.011517665611282774, | |
| "acc_norm": 0.5837867247007617, | |
| "acc_norm_stderr": 0.011500864675166568 | |
| }, | |
| "hendrycksTest-miscellaneous": { | |
| "acc": 0.26053639846743293, | |
| "acc_stderr": 0.01569600856380708, | |
| "acc_norm": 0.25287356321839083, | |
| "acc_norm_stderr": 0.015543377313719681 | |
| }, | |
| "sciq": { | |
| "acc": 0.592, | |
| "acc_stderr": 0.015549205052920676, | |
| "acc_norm": 0.515, | |
| "acc_norm_stderr": 0.015812179641814902 | |
| }, | |
| "hendrycksTest-professional_psychology": { | |
| "acc": 0.2434640522875817, | |
| "acc_stderr": 0.017362473762146634, | |
| "acc_norm": 0.25, | |
| "acc_norm_stderr": 0.01751781884501444 | |
| }, | |
| "crows_pairs_english_nationality": { | |
| "likelihood_difference": 3.6025028935185186, | |
| "likelihood_difference_stderr": 0.28513005796161467, | |
| "pct_stereotype": 0.4166666666666667, | |
| "pct_stereotype_stderr": 0.03362277436608043 | |
| }, | |
| "hendrycksTest-college_mathematics": { | |
| "acc": 0.18, | |
| "acc_stderr": 0.038612291966536955, | |
| "acc_norm": 0.24, | |
| "acc_norm_stderr": 0.04292346959909282 | |
| }, | |
| "crows_pairs_english_gender": { | |
| "likelihood_difference": 2.84423828125, | |
| "likelihood_difference_stderr": 0.2759970404950795, | |
| "pct_stereotype": 0.58125, | |
| "pct_stereotype_stderr": 0.027622536202702143 | |
| }, | |
| "hendrycksTest-clinical_knowledge": { | |
| "acc": 0.2339622641509434, | |
| "acc_stderr": 0.02605529690115292, | |
| "acc_norm": 0.27169811320754716, | |
| "acc_norm_stderr": 0.027377706624670713 | |
| }, | |
| "crows_pairs_english_disability": { | |
| "likelihood_difference": 5.205288461538461, | |
| "likelihood_difference_stderr": 0.5700502266857143, | |
| "pct_stereotype": 0.5384615384615384, | |
| "pct_stereotype_stderr": 0.06231481440776789 | |
| }, | |
| "hendrycksTest-public_relations": { | |
| "acc": 0.3090909090909091, | |
| "acc_stderr": 0.044262946482000985, | |
| "acc_norm": 0.23636363636363636, | |
| "acc_norm_stderr": 0.040693063197213775 | |
| }, | |
| "arc_challenge": { | |
| "acc": 0.17918088737201365, | |
| "acc_stderr": 0.011207045216615674, | |
| "acc_norm": 0.21245733788395904, | |
| "acc_norm_stderr": 0.011953482906582952 | |
| }, | |
| "crows_pairs_english_age": { | |
| "likelihood_difference": 2.5685096153846154, | |
| "likelihood_difference_stderr": 0.2910998803105466, | |
| "pct_stereotype": 0.4945054945054945, | |
| "pct_stereotype_stderr": 0.052701445311128796 | |
| }, | |
| "hendrycksTest-elementary_mathematics": { | |
| "acc": 0.2777777777777778, | |
| "acc_stderr": 0.02306818884826111, | |
| "acc_norm": 0.2804232804232804, | |
| "acc_norm_stderr": 0.023135287974325628 | |
| }, | |
| "crows_pairs_french_age": { | |
| "likelihood_difference": 3.855208333333333, | |
| "likelihood_difference_stderr": 0.4788440459459206, | |
| "pct_stereotype": 0.45555555555555555, | |
| "pct_stereotype_stderr": 0.05279009646630345 | |
| }, | |
| "crows_pairs_french_disability": { | |
| "likelihood_difference": 6.4081439393939394, | |
| "likelihood_difference_stderr": 0.5962932736116068, | |
| "pct_stereotype": 0.48484848484848486, | |
| "pct_stereotype_stderr": 0.06198888629778894 | |
| }, | |
| "winogrande": { | |
| "acc": 0.5082872928176796, | |
| "acc_stderr": 0.014050555322824192 | |
| }, | |
| "hendrycksTest-high_school_computer_science": { | |
| "acc": 0.3, | |
| "acc_stderr": 0.046056618647183814, | |
| "acc_norm": 0.29, | |
| "acc_norm_stderr": 0.04560480215720683 | |
| }, | |
| "hendrycksTest-professional_law": { | |
| "acc": 0.23728813559322035, | |
| "acc_stderr": 0.010865436690780269, | |
| "acc_norm": 0.2646675358539765, | |
| "acc_norm_stderr": 0.011267332992845528 | |
| }, | |
| "hendrycksTest-college_chemistry": { | |
| "acc": 0.27, | |
| "acc_stderr": 0.0446196043338474, | |
| "acc_norm": 0.29, | |
| "acc_norm_stderr": 0.045604802157206845 | |
| }, | |
| "crows_pairs_english_socioeconomic": { | |
| "likelihood_difference": 3.9288651315789473, | |
| "likelihood_difference_stderr": 0.2802051715338846, | |
| "pct_stereotype": 0.6473684210526316, | |
| "pct_stereotype_stderr": 0.03475405259582098 | |
| }, | |
| "hendrycksTest-human_sexuality": { | |
| "acc": 0.26717557251908397, | |
| "acc_stderr": 0.03880848301082394, | |
| "acc_norm": 0.25190839694656486, | |
| "acc_norm_stderr": 0.03807387116306086 | |
| }, | |
| "arc_easy": { | |
| "acc": 0.35058922558922556, | |
| "acc_stderr": 0.00979100382983156, | |
| "acc_norm": 0.3354377104377104, | |
| "acc_norm_stderr": 0.009688175165829592 | |
| }, | |
| "hendrycksTest-college_physics": { | |
| "acc": 0.19607843137254902, | |
| "acc_stderr": 0.03950581861179963, | |
| "acc_norm": 0.28431372549019607, | |
| "acc_norm_stderr": 0.04488482852329017 | |
| }, | |
| "crows_pairs_french_sexual_orientation": { | |
| "likelihood_difference": 5.685096153846154, | |
| "likelihood_difference_stderr": 0.5726006934973705, | |
| "pct_stereotype": 0.7802197802197802, | |
| "pct_stereotype_stderr": 0.04364972632898534 | |
| }, | |
| "hendrycksTest-high_school_government_and_politics": { | |
| "acc": 0.18134715025906736, | |
| "acc_stderr": 0.02780703236068609, | |
| "acc_norm": 0.23834196891191708, | |
| "acc_norm_stderr": 0.030748905363909895 | |
| }, | |
| "hendrycksTest-human_aging": { | |
| "acc": 0.3094170403587444, | |
| "acc_stderr": 0.03102441174057221, | |
| "acc_norm": 0.27802690582959644, | |
| "acc_norm_stderr": 0.030069584874494033 | |
| }, | |
| "hendrycksTest-formal_logic": { | |
| "acc": 0.2698412698412698, | |
| "acc_stderr": 0.039701582732351706, | |
| "acc_norm": 0.21428571428571427, | |
| "acc_norm_stderr": 0.03670066451047181 | |
| }, | |
| "logiqa": { | |
| "acc": 0.22887864823348694, | |
| "acc_stderr": 0.016478107276313284, | |
| "acc_norm": 0.28110599078341014, | |
| "acc_norm_stderr": 0.017632374626460005 | |
| }, | |
| "crows_pairs_french_physical_appearance": { | |
| "likelihood_difference": 6.457465277777778, | |
| "likelihood_difference_stderr": 0.6653048237221467, | |
| "pct_stereotype": 0.5138888888888888, | |
| "pct_stereotype_stderr": 0.05931618532716555 | |
| }, | |
| "hendrycksTest-high_school_microeconomics": { | |
| "acc": 0.21428571428571427, | |
| "acc_stderr": 0.02665353159671548, | |
| "acc_norm": 0.31932773109243695, | |
| "acc_norm_stderr": 0.030283995525884396 | |
| }, | |
| "hendrycksTest-high_school_biology": { | |
| "acc": 0.25483870967741934, | |
| "acc_stderr": 0.02479011845933221, | |
| "acc_norm": 0.2806451612903226, | |
| "acc_norm_stderr": 0.025560604721022895 | |
| }, | |
| "hendrycksTest-high_school_us_history": { | |
| "acc": 0.22549019607843138, | |
| "acc_stderr": 0.02933116229425172, | |
| "acc_norm": 0.25980392156862747, | |
| "acc_norm_stderr": 0.03077855467869327 | |
| }, | |
| "hendrycksTest-high_school_world_history": { | |
| "acc": 0.2616033755274262, | |
| "acc_stderr": 0.028609516716994934, | |
| "acc_norm": 0.28270042194092826, | |
| "acc_norm_stderr": 0.029312814153955924 | |
| }, | |
| "hendrycksTest-high_school_psychology": { | |
| "acc": 0.26605504587155965, | |
| "acc_stderr": 0.018946022322225604, | |
| "acc_norm": 0.26788990825688075, | |
| "acc_norm_stderr": 0.018987462257978652 | |
| }, | |
| "hendrycksTest-conceptual_physics": { | |
| "acc": 0.23829787234042554, | |
| "acc_stderr": 0.027851252973889778, | |
| "acc_norm": 0.2127659574468085, | |
| "acc_norm_stderr": 0.026754391348039766 | |
| }, | |
| "hendrycksTest-high_school_chemistry": { | |
| "acc": 0.15270935960591134, | |
| "acc_stderr": 0.025308904539380644, | |
| "acc_norm": 0.28078817733990147, | |
| "acc_norm_stderr": 0.031618563353586086 | |
| }, | |
| "crows_pairs_french_nationality": { | |
| "likelihood_difference": 8.12685276679842, | |
| "likelihood_difference_stderr": 0.41139398422096635, | |
| "pct_stereotype": 0.233201581027668, | |
| "pct_stereotype_stderr": 0.026638273845497513 | |
| }, | |
| "hendrycksTest-high_school_macroeconomics": { | |
| "acc": 0.2230769230769231, | |
| "acc_stderr": 0.021107730127243998, | |
| "acc_norm": 0.24615384615384617, | |
| "acc_norm_stderr": 0.02184086699042308 | |
| }, | |
| "crows_pairs_english_sexual_orientation": { | |
| "likelihood_difference": 4.108198924731183, | |
| "likelihood_difference_stderr": 0.5195853940706195, | |
| "pct_stereotype": 0.7204301075268817, | |
| "pct_stereotype_stderr": 0.046789371667506734 | |
| }, | |
| "hendrycksTest-high_school_mathematics": { | |
| "acc": 0.16296296296296298, | |
| "acc_stderr": 0.022518561997682648, | |
| "acc_norm": 0.24444444444444444, | |
| "acc_norm_stderr": 0.02620276653465215 | |
| }, | |
| "crows_pairs_french_autre": { | |
| "likelihood_difference": 2.8365384615384617, | |
| "likelihood_difference_stderr": 0.7093355864720875, | |
| "pct_stereotype": 0.3076923076923077, | |
| "pct_stereotype_stderr": 0.13323467750529824 | |
| }, | |
| "hendrycksTest-college_medicine": { | |
| "acc": 0.26011560693641617, | |
| "acc_stderr": 0.033450369167889904, | |
| "acc_norm": 0.2774566473988439, | |
| "acc_norm_stderr": 0.034140140070440354 | |
| }, | |
| "hendrycksTest-management": { | |
| "acc": 0.21359223300970873, | |
| "acc_stderr": 0.040580420156460344, | |
| "acc_norm": 0.2524271844660194, | |
| "acc_norm_stderr": 0.04301250399690877 | |
| }, | |
| "hendrycksTest-us_foreign_policy": { | |
| "acc": 0.25, | |
| "acc_stderr": 0.04351941398892446, | |
| "acc_norm": 0.28, | |
| "acc_norm_stderr": 0.04512608598542127 | |
| }, | |
| "hendrycksTest-world_religions": { | |
| "acc": 0.23976608187134502, | |
| "acc_stderr": 0.03274485211946956, | |
| "acc_norm": 0.30409356725146197, | |
| "acc_norm_stderr": 0.03528211258245232 | |
| }, | |
| "hendrycksTest-moral_scenarios": { | |
| "acc": 0.24692737430167597, | |
| "acc_stderr": 0.014422292204808835, | |
| "acc_norm": 0.25139664804469275, | |
| "acc_norm_stderr": 0.014508979453553972 | |
| }, | |
| "hendrycksTest-global_facts": { | |
| "acc": 0.19, | |
| "acc_stderr": 0.03942772444036625, | |
| "acc_norm": 0.23, | |
| "acc_norm_stderr": 0.04229525846816506 | |
| }, | |
| "hendrycksTest-computer_security": { | |
| "acc": 0.22, | |
| "acc_stderr": 0.04163331998932269, | |
| "acc_norm": 0.3, | |
| "acc_norm_stderr": 0.046056618647183814 | |
| }, | |
| "crows_pairs_english_race_color": { | |
| "likelihood_difference": 3.3907787893700787, | |
| "likelihood_difference_stderr": 0.16817580159222853, | |
| "pct_stereotype": 0.4507874015748031, | |
| "pct_stereotype_stderr": 0.02209795835867595 | |
| }, | |
| "hendrycksTest-high_school_european_history": { | |
| "acc": 0.21818181818181817, | |
| "acc_stderr": 0.03225078108306289, | |
| "acc_norm": 0.2909090909090909, | |
| "acc_norm_stderr": 0.03546563019624336 | |
| }, | |
| "hendrycksTest-marketing": { | |
| "acc": 0.2222222222222222, | |
| "acc_stderr": 0.027236013946196708, | |
| "acc_norm": 0.2606837606837607, | |
| "acc_norm_stderr": 0.028760348956523414 | |
| }, | |
| "hendrycksTest-philosophy": { | |
| "acc": 0.19935691318327975, | |
| "acc_stderr": 0.022691033780549656, | |
| "acc_norm": 0.2829581993569132, | |
| "acc_norm_stderr": 0.025583062489984824 | |
| }, | |
| "hendrycksTest-high_school_physics": { | |
| "acc": 0.23178807947019867, | |
| "acc_stderr": 0.03445406271987053, | |
| "acc_norm": 0.23178807947019867, | |
| "acc_norm_stderr": 0.03445406271987054 | |
| }, | |
| "crows_pairs_english_autre": { | |
| "likelihood_difference": 4.900568181818182, | |
| "likelihood_difference_stderr": 1.7545892452142433, | |
| "pct_stereotype": 0.45454545454545453, | |
| "pct_stereotype_stderr": 0.15745916432444335 | |
| }, | |
| "lambada_openai": { | |
| "ppl": 411.658325603736, | |
| "ppl_stderr": 17.894759386978997, | |
| "acc": 0.12128856976518533, | |
| "acc_stderr": 0.004548258586998434 | |
| }, | |
| "hendrycksTest-college_biology": { | |
| "acc": 0.2847222222222222, | |
| "acc_stderr": 0.037738099906869334, | |
| "acc_norm": 0.2708333333333333, | |
| "acc_norm_stderr": 0.037161774375660185 | |
| }, | |
| "hendrycksTest-logical_fallacies": { | |
| "acc": 0.22699386503067484, | |
| "acc_stderr": 0.03291099578615769, | |
| "acc_norm": 0.26380368098159507, | |
| "acc_norm_stderr": 0.03462419931615622 | |
| }, | |
| "hendrycksTest-abstract_algebra": { | |
| "acc": 0.19, | |
| "acc_stderr": 0.03942772444036623, | |
| "acc_norm": 0.2, | |
| "acc_norm_stderr": 0.040201512610368445 | |
| }, | |
| "hendrycksTest-anatomy": { | |
| "acc": 0.28888888888888886, | |
| "acc_stderr": 0.03915450630414251, | |
| "acc_norm": 0.25925925925925924, | |
| "acc_norm_stderr": 0.03785714465066655 | |
| }, | |
| "hendrycksTest-moral_disputes": { | |
| "acc": 0.21098265895953758, | |
| "acc_stderr": 0.021966309947043135, | |
| "acc_norm": 0.3063583815028902, | |
| "acc_norm_stderr": 0.02481835012943659 | |
| }, | |
| "hendrycksTest-jurisprudence": { | |
| "acc": 0.23148148148148148, | |
| "acc_stderr": 0.04077494709252626, | |
| "acc_norm": 0.35185185185185186, | |
| "acc_norm_stderr": 0.04616631111801713 | |
| }, | |
| "hendrycksTest-high_school_statistics": { | |
| "acc": 0.19907407407407407, | |
| "acc_stderr": 0.027232298462690242, | |
| "acc_norm": 0.2361111111111111, | |
| "acc_norm_stderr": 0.028963702570791037 | |
| }, | |
| "hendrycksTest-prehistory": { | |
| "acc": 0.26851851851851855, | |
| "acc_stderr": 0.024659685185967273, | |
| "acc_norm": 0.22839506172839505, | |
| "acc_norm_stderr": 0.023358211840626267 | |
| }, | |
| "crows_pairs_french_socioeconomic": { | |
| "likelihood_difference": 4.585817920918367, | |
| "likelihood_difference_stderr": 0.3843067957203524, | |
| "pct_stereotype": 0.4489795918367347, | |
| "pct_stereotype_stderr": 0.03561884533975954 | |
| } | |
| }, | |
| "versions": { | |
| "crows_pairs_english": 0, | |
| "hendrycksTest-international_law": 0, | |
| "hendrycksTest-sociology": 0, | |
| "wsc": 0, | |
| "hendrycksTest-econometrics": 0, | |
| "hendrycksTest-electrical_engineering": 0, | |
| "hendrycksTest-business_ethics": 0, | |
| "hendrycksTest-virology": 0, | |
| "hendrycksTest-nutrition": 0, | |
| "crows_pairs_french_religion": 0, | |
| "crows_pairs_french": 0, | |
| "crows_pairs_french_gender": 0, | |
| "hendrycksTest-security_studies": 0, | |
| "hendrycksTest-college_computer_science": 0, | |
| "crows_pairs_english_physical_appearance": 0, | |
| "hendrycksTest-professional_medicine": 0, | |
| "hendrycksTest-astronomy": 0, | |
| "hendrycksTest-high_school_geography": 0, | |
| "crows_pairs_french_race_color": 0, | |
| "hendrycksTest-machine_learning": 0, | |
| "crows_pairs_english_religion": 0, | |
| "hendrycksTest-professional_accounting": 0, | |
| "hendrycksTest-medical_genetics": 0, | |
| "piqa": 0, | |
| "hendrycksTest-miscellaneous": 0, | |
| "sciq": 0, | |
| "hendrycksTest-professional_psychology": 0, | |
| "crows_pairs_english_nationality": 0, | |
| "hendrycksTest-college_mathematics": 0, | |
| "crows_pairs_english_gender": 0, | |
| "hendrycksTest-clinical_knowledge": 0, | |
| "crows_pairs_english_disability": 0, | |
| "hendrycksTest-public_relations": 0, | |
| "arc_challenge": 0, | |
| "crows_pairs_english_age": 0, | |
| "hendrycksTest-elementary_mathematics": 0, | |
| "crows_pairs_french_age": 0, | |
| "crows_pairs_french_disability": 0, | |
| "winogrande": 0, | |
| "hendrycksTest-high_school_computer_science": 0, | |
| "hendrycksTest-professional_law": 0, | |
| "hendrycksTest-college_chemistry": 0, | |
| "crows_pairs_english_socioeconomic": 0, | |
| "hendrycksTest-human_sexuality": 0, | |
| "arc_easy": 0, | |
| "hendrycksTest-college_physics": 0, | |
| "crows_pairs_french_sexual_orientation": 0, | |
| "hendrycksTest-high_school_government_and_politics": 0, | |
| "hendrycksTest-human_aging": 0, | |
| "hendrycksTest-formal_logic": 0, | |
| "logiqa": 0, | |
| "crows_pairs_french_physical_appearance": 0, | |
| "hendrycksTest-high_school_microeconomics": 0, | |
| "hendrycksTest-high_school_biology": 0, | |
| "hendrycksTest-high_school_us_history": 0, | |
| "hendrycksTest-high_school_world_history": 0, | |
| "hendrycksTest-high_school_psychology": 0, | |
| "hendrycksTest-conceptual_physics": 0, | |
| "hendrycksTest-high_school_chemistry": 0, | |
| "crows_pairs_french_nationality": 0, | |
| "hendrycksTest-high_school_macroeconomics": 0, | |
| "crows_pairs_english_sexual_orientation": 0, | |
| "hendrycksTest-high_school_mathematics": 0, | |
| "crows_pairs_french_autre": 0, | |
| "hendrycksTest-college_medicine": 0, | |
| "hendrycksTest-management": 0, | |
| "hendrycksTest-us_foreign_policy": 0, | |
| "hendrycksTest-world_religions": 0, | |
| "hendrycksTest-moral_scenarios": 0, | |
| "hendrycksTest-global_facts": 0, | |
| "hendrycksTest-computer_security": 0, | |
| "crows_pairs_english_race_color": 0, | |
| "hendrycksTest-high_school_european_history": 0, | |
| "hendrycksTest-marketing": 0, | |
| "hendrycksTest-philosophy": 0, | |
| "hendrycksTest-high_school_physics": 0, | |
| "crows_pairs_english_autre": 0, | |
| "lambada_openai": 0, | |
| "hendrycksTest-college_biology": 0, | |
| "hendrycksTest-logical_fallacies": 0, | |
| "hendrycksTest-abstract_algebra": 0, | |
| "hendrycksTest-anatomy": 0, | |
| "hendrycksTest-moral_disputes": 0, | |
| "hendrycksTest-jurisprudence": 0, | |
| "hendrycksTest-high_school_statistics": 0, | |
| "hendrycksTest-prehistory": 0, | |
| "crows_pairs_french_socioeconomic": 0 | |
| }, | |
| "config": { | |
| "model": "hf-causal", | |
| "model_args": "pretrained=EleutherAI/pythia-v1.1-70m,revision=step3000", | |
| "num_fewshot": 0, | |
| "batch_size": 16, | |
| "device": "cuda:0", | |
| "no_cache": true, | |
| "limit": null, | |
| "bootstrap_iters": 100000, | |
| "description_dict": {} | |
| } | |
| } |