karlexmarin's picture
feat: ship paper artefacts + CLI diagnostic alongside browser tool
535348a
raw
history blame
22 kB
{
"results": {
"crows_pairs_english": {
"likelihood_difference": 3.4712842874180083,
"likelihood_difference_stderr": 0.10356437803282284,
"pct_stereotype": 0.528324388789505,
"pct_stereotype_stderr": 0.012193686719906043
},
"hendrycksTest-international_law": {
"acc": 0.15702479338842976,
"acc_stderr": 0.0332124484254713,
"acc_norm": 0.3884297520661157,
"acc_norm_stderr": 0.04449270350068381
},
"hendrycksTest-sociology": {
"acc": 0.2736318407960199,
"acc_stderr": 0.03152439186555405,
"acc_norm": 0.29850746268656714,
"acc_norm_stderr": 0.032357437893550424
},
"wsc": {
"acc": 0.36538461538461536,
"acc_stderr": 0.0474473339327792
},
"hendrycksTest-econometrics": {
"acc": 0.2631578947368421,
"acc_stderr": 0.0414243971948936,
"acc_norm": 0.2631578947368421,
"acc_norm_stderr": 0.0414243971948936
},
"hendrycksTest-electrical_engineering": {
"acc": 0.25517241379310346,
"acc_stderr": 0.03632984052707842,
"acc_norm": 0.2827586206896552,
"acc_norm_stderr": 0.03752833958003337
},
"hendrycksTest-business_ethics": {
"acc": 0.27,
"acc_stderr": 0.044619604333847394,
"acc_norm": 0.25,
"acc_norm_stderr": 0.04351941398892446
},
"hendrycksTest-virology": {
"acc": 0.2710843373493976,
"acc_stderr": 0.03460579907553028,
"acc_norm": 0.2289156626506024,
"acc_norm_stderr": 0.03270745277352477
},
"hendrycksTest-nutrition": {
"acc": 0.24836601307189543,
"acc_stderr": 0.024739981355113596,
"acc_norm": 0.32679738562091504,
"acc_norm_stderr": 0.026857294663281423
},
"crows_pairs_french_religion": {
"likelihood_difference": 6.3964673913043475,
"likelihood_difference_stderr": 0.5088370601767548,
"pct_stereotype": 0.45217391304347826,
"pct_stereotype_stderr": 0.04661456979958347
},
"crows_pairs_french": {
"likelihood_difference": 5.3115170319022065,
"likelihood_difference_stderr": 0.13097278073096086,
"pct_stereotype": 0.4502087060226595,
"pct_stereotype_stderr": 0.012152590574174895
},
"crows_pairs_french_gender": {
"likelihood_difference": 4.093165887850467,
"likelihood_difference_stderr": 0.22263346692021055,
"pct_stereotype": 0.5233644859813084,
"pct_stereotype_stderr": 0.027920316348204993
},
"hendrycksTest-security_studies": {
"acc": 0.2693877551020408,
"acc_stderr": 0.02840125202902294,
"acc_norm": 0.22040816326530613,
"acc_norm_stderr": 0.026537045312145277
},
"hendrycksTest-college_computer_science": {
"acc": 0.22,
"acc_stderr": 0.04163331998932269,
"acc_norm": 0.19,
"acc_norm_stderr": 0.03942772444036623
},
"crows_pairs_english_physical_appearance": {
"likelihood_difference": 3.4659288194444446,
"likelihood_difference_stderr": 0.3806466296043766,
"pct_stereotype": 0.5416666666666666,
"pct_stereotype_stderr": 0.05913268547421811
},
"hendrycksTest-professional_medicine": {
"acc": 0.26838235294117646,
"acc_stderr": 0.02691748122437721,
"acc_norm": 0.2647058823529412,
"acc_norm_stderr": 0.026799562024887674
},
"hendrycksTest-astronomy": {
"acc": 0.23026315789473684,
"acc_stderr": 0.03426059424403165,
"acc_norm": 0.32894736842105265,
"acc_norm_stderr": 0.03823428969926605
},
"hendrycksTest-high_school_geography": {
"acc": 0.25252525252525254,
"acc_stderr": 0.030954055470365904,
"acc_norm": 0.25757575757575757,
"acc_norm_stderr": 0.031156269519646836
},
"crows_pairs_french_race_color": {
"likelihood_difference": 4.595516304347826,
"likelihood_difference_stderr": 0.23328673581474416,
"pct_stereotype": 0.44130434782608696,
"pct_stereotype_stderr": 0.023176636328300308
},
"hendrycksTest-machine_learning": {
"acc": 0.26785714285714285,
"acc_stderr": 0.04203277291467762,
"acc_norm": 0.25,
"acc_norm_stderr": 0.04109974682633932
},
"crows_pairs_english_religion": {
"likelihood_difference": 3.661739864864865,
"likelihood_difference_stderr": 0.4569099844348636,
"pct_stereotype": 0.6036036036036037,
"pct_stereotype_stderr": 0.04663848326322448
},
"hendrycksTest-professional_accounting": {
"acc": 0.19858156028368795,
"acc_stderr": 0.02379830163794214,
"acc_norm": 0.20567375886524822,
"acc_norm_stderr": 0.024112138950471887
},
"hendrycksTest-medical_genetics": {
"acc": 0.25,
"acc_stderr": 0.04351941398892446,
"acc_norm": 0.34,
"acc_norm_stderr": 0.04760952285695235
},
"piqa": {
"acc": 0.5794341675734495,
"acc_stderr": 0.011517665611282774,
"acc_norm": 0.5837867247007617,
"acc_norm_stderr": 0.011500864675166568
},
"hendrycksTest-miscellaneous": {
"acc": 0.26053639846743293,
"acc_stderr": 0.01569600856380708,
"acc_norm": 0.25287356321839083,
"acc_norm_stderr": 0.015543377313719681
},
"sciq": {
"acc": 0.592,
"acc_stderr": 0.015549205052920676,
"acc_norm": 0.515,
"acc_norm_stderr": 0.015812179641814902
},
"hendrycksTest-professional_psychology": {
"acc": 0.2434640522875817,
"acc_stderr": 0.017362473762146634,
"acc_norm": 0.25,
"acc_norm_stderr": 0.01751781884501444
},
"crows_pairs_english_nationality": {
"likelihood_difference": 3.6025028935185186,
"likelihood_difference_stderr": 0.28513005796161467,
"pct_stereotype": 0.4166666666666667,
"pct_stereotype_stderr": 0.03362277436608043
},
"hendrycksTest-college_mathematics": {
"acc": 0.18,
"acc_stderr": 0.038612291966536955,
"acc_norm": 0.24,
"acc_norm_stderr": 0.04292346959909282
},
"crows_pairs_english_gender": {
"likelihood_difference": 2.84423828125,
"likelihood_difference_stderr": 0.2759970404950795,
"pct_stereotype": 0.58125,
"pct_stereotype_stderr": 0.027622536202702143
},
"hendrycksTest-clinical_knowledge": {
"acc": 0.2339622641509434,
"acc_stderr": 0.02605529690115292,
"acc_norm": 0.27169811320754716,
"acc_norm_stderr": 0.027377706624670713
},
"crows_pairs_english_disability": {
"likelihood_difference": 5.205288461538461,
"likelihood_difference_stderr": 0.5700502266857143,
"pct_stereotype": 0.5384615384615384,
"pct_stereotype_stderr": 0.06231481440776789
},
"hendrycksTest-public_relations": {
"acc": 0.3090909090909091,
"acc_stderr": 0.044262946482000985,
"acc_norm": 0.23636363636363636,
"acc_norm_stderr": 0.040693063197213775
},
"arc_challenge": {
"acc": 0.17918088737201365,
"acc_stderr": 0.011207045216615674,
"acc_norm": 0.21245733788395904,
"acc_norm_stderr": 0.011953482906582952
},
"crows_pairs_english_age": {
"likelihood_difference": 2.5685096153846154,
"likelihood_difference_stderr": 0.2910998803105466,
"pct_stereotype": 0.4945054945054945,
"pct_stereotype_stderr": 0.052701445311128796
},
"hendrycksTest-elementary_mathematics": {
"acc": 0.2777777777777778,
"acc_stderr": 0.02306818884826111,
"acc_norm": 0.2804232804232804,
"acc_norm_stderr": 0.023135287974325628
},
"crows_pairs_french_age": {
"likelihood_difference": 3.855208333333333,
"likelihood_difference_stderr": 0.4788440459459206,
"pct_stereotype": 0.45555555555555555,
"pct_stereotype_stderr": 0.05279009646630345
},
"crows_pairs_french_disability": {
"likelihood_difference": 6.4081439393939394,
"likelihood_difference_stderr": 0.5962932736116068,
"pct_stereotype": 0.48484848484848486,
"pct_stereotype_stderr": 0.06198888629778894
},
"winogrande": {
"acc": 0.5082872928176796,
"acc_stderr": 0.014050555322824192
},
"hendrycksTest-high_school_computer_science": {
"acc": 0.3,
"acc_stderr": 0.046056618647183814,
"acc_norm": 0.29,
"acc_norm_stderr": 0.04560480215720683
},
"hendrycksTest-professional_law": {
"acc": 0.23728813559322035,
"acc_stderr": 0.010865436690780269,
"acc_norm": 0.2646675358539765,
"acc_norm_stderr": 0.011267332992845528
},
"hendrycksTest-college_chemistry": {
"acc": 0.27,
"acc_stderr": 0.0446196043338474,
"acc_norm": 0.29,
"acc_norm_stderr": 0.045604802157206845
},
"crows_pairs_english_socioeconomic": {
"likelihood_difference": 3.9288651315789473,
"likelihood_difference_stderr": 0.2802051715338846,
"pct_stereotype": 0.6473684210526316,
"pct_stereotype_stderr": 0.03475405259582098
},
"hendrycksTest-human_sexuality": {
"acc": 0.26717557251908397,
"acc_stderr": 0.03880848301082394,
"acc_norm": 0.25190839694656486,
"acc_norm_stderr": 0.03807387116306086
},
"arc_easy": {
"acc": 0.35058922558922556,
"acc_stderr": 0.00979100382983156,
"acc_norm": 0.3354377104377104,
"acc_norm_stderr": 0.009688175165829592
},
"hendrycksTest-college_physics": {
"acc": 0.19607843137254902,
"acc_stderr": 0.03950581861179963,
"acc_norm": 0.28431372549019607,
"acc_norm_stderr": 0.04488482852329017
},
"crows_pairs_french_sexual_orientation": {
"likelihood_difference": 5.685096153846154,
"likelihood_difference_stderr": 0.5726006934973705,
"pct_stereotype": 0.7802197802197802,
"pct_stereotype_stderr": 0.04364972632898534
},
"hendrycksTest-high_school_government_and_politics": {
"acc": 0.18134715025906736,
"acc_stderr": 0.02780703236068609,
"acc_norm": 0.23834196891191708,
"acc_norm_stderr": 0.030748905363909895
},
"hendrycksTest-human_aging": {
"acc": 0.3094170403587444,
"acc_stderr": 0.03102441174057221,
"acc_norm": 0.27802690582959644,
"acc_norm_stderr": 0.030069584874494033
},
"hendrycksTest-formal_logic": {
"acc": 0.2698412698412698,
"acc_stderr": 0.039701582732351706,
"acc_norm": 0.21428571428571427,
"acc_norm_stderr": 0.03670066451047181
},
"logiqa": {
"acc": 0.22887864823348694,
"acc_stderr": 0.016478107276313284,
"acc_norm": 0.28110599078341014,
"acc_norm_stderr": 0.017632374626460005
},
"crows_pairs_french_physical_appearance": {
"likelihood_difference": 6.457465277777778,
"likelihood_difference_stderr": 0.6653048237221467,
"pct_stereotype": 0.5138888888888888,
"pct_stereotype_stderr": 0.05931618532716555
},
"hendrycksTest-high_school_microeconomics": {
"acc": 0.21428571428571427,
"acc_stderr": 0.02665353159671548,
"acc_norm": 0.31932773109243695,
"acc_norm_stderr": 0.030283995525884396
},
"hendrycksTest-high_school_biology": {
"acc": 0.25483870967741934,
"acc_stderr": 0.02479011845933221,
"acc_norm": 0.2806451612903226,
"acc_norm_stderr": 0.025560604721022895
},
"hendrycksTest-high_school_us_history": {
"acc": 0.22549019607843138,
"acc_stderr": 0.02933116229425172,
"acc_norm": 0.25980392156862747,
"acc_norm_stderr": 0.03077855467869327
},
"hendrycksTest-high_school_world_history": {
"acc": 0.2616033755274262,
"acc_stderr": 0.028609516716994934,
"acc_norm": 0.28270042194092826,
"acc_norm_stderr": 0.029312814153955924
},
"hendrycksTest-high_school_psychology": {
"acc": 0.26605504587155965,
"acc_stderr": 0.018946022322225604,
"acc_norm": 0.26788990825688075,
"acc_norm_stderr": 0.018987462257978652
},
"hendrycksTest-conceptual_physics": {
"acc": 0.23829787234042554,
"acc_stderr": 0.027851252973889778,
"acc_norm": 0.2127659574468085,
"acc_norm_stderr": 0.026754391348039766
},
"hendrycksTest-high_school_chemistry": {
"acc": 0.15270935960591134,
"acc_stderr": 0.025308904539380644,
"acc_norm": 0.28078817733990147,
"acc_norm_stderr": 0.031618563353586086
},
"crows_pairs_french_nationality": {
"likelihood_difference": 8.12685276679842,
"likelihood_difference_stderr": 0.41139398422096635,
"pct_stereotype": 0.233201581027668,
"pct_stereotype_stderr": 0.026638273845497513
},
"hendrycksTest-high_school_macroeconomics": {
"acc": 0.2230769230769231,
"acc_stderr": 0.021107730127243998,
"acc_norm": 0.24615384615384617,
"acc_norm_stderr": 0.02184086699042308
},
"crows_pairs_english_sexual_orientation": {
"likelihood_difference": 4.108198924731183,
"likelihood_difference_stderr": 0.5195853940706195,
"pct_stereotype": 0.7204301075268817,
"pct_stereotype_stderr": 0.046789371667506734
},
"hendrycksTest-high_school_mathematics": {
"acc": 0.16296296296296298,
"acc_stderr": 0.022518561997682648,
"acc_norm": 0.24444444444444444,
"acc_norm_stderr": 0.02620276653465215
},
"crows_pairs_french_autre": {
"likelihood_difference": 2.8365384615384617,
"likelihood_difference_stderr": 0.7093355864720875,
"pct_stereotype": 0.3076923076923077,
"pct_stereotype_stderr": 0.13323467750529824
},
"hendrycksTest-college_medicine": {
"acc": 0.26011560693641617,
"acc_stderr": 0.033450369167889904,
"acc_norm": 0.2774566473988439,
"acc_norm_stderr": 0.034140140070440354
},
"hendrycksTest-management": {
"acc": 0.21359223300970873,
"acc_stderr": 0.040580420156460344,
"acc_norm": 0.2524271844660194,
"acc_norm_stderr": 0.04301250399690877
},
"hendrycksTest-us_foreign_policy": {
"acc": 0.25,
"acc_stderr": 0.04351941398892446,
"acc_norm": 0.28,
"acc_norm_stderr": 0.04512608598542127
},
"hendrycksTest-world_religions": {
"acc": 0.23976608187134502,
"acc_stderr": 0.03274485211946956,
"acc_norm": 0.30409356725146197,
"acc_norm_stderr": 0.03528211258245232
},
"hendrycksTest-moral_scenarios": {
"acc": 0.24692737430167597,
"acc_stderr": 0.014422292204808835,
"acc_norm": 0.25139664804469275,
"acc_norm_stderr": 0.014508979453553972
},
"hendrycksTest-global_facts": {
"acc": 0.19,
"acc_stderr": 0.03942772444036625,
"acc_norm": 0.23,
"acc_norm_stderr": 0.04229525846816506
},
"hendrycksTest-computer_security": {
"acc": 0.22,
"acc_stderr": 0.04163331998932269,
"acc_norm": 0.3,
"acc_norm_stderr": 0.046056618647183814
},
"crows_pairs_english_race_color": {
"likelihood_difference": 3.3907787893700787,
"likelihood_difference_stderr": 0.16817580159222853,
"pct_stereotype": 0.4507874015748031,
"pct_stereotype_stderr": 0.02209795835867595
},
"hendrycksTest-high_school_european_history": {
"acc": 0.21818181818181817,
"acc_stderr": 0.03225078108306289,
"acc_norm": 0.2909090909090909,
"acc_norm_stderr": 0.03546563019624336
},
"hendrycksTest-marketing": {
"acc": 0.2222222222222222,
"acc_stderr": 0.027236013946196708,
"acc_norm": 0.2606837606837607,
"acc_norm_stderr": 0.028760348956523414
},
"hendrycksTest-philosophy": {
"acc": 0.19935691318327975,
"acc_stderr": 0.022691033780549656,
"acc_norm": 0.2829581993569132,
"acc_norm_stderr": 0.025583062489984824
},
"hendrycksTest-high_school_physics": {
"acc": 0.23178807947019867,
"acc_stderr": 0.03445406271987053,
"acc_norm": 0.23178807947019867,
"acc_norm_stderr": 0.03445406271987054
},
"crows_pairs_english_autre": {
"likelihood_difference": 4.900568181818182,
"likelihood_difference_stderr": 1.7545892452142433,
"pct_stereotype": 0.45454545454545453,
"pct_stereotype_stderr": 0.15745916432444335
},
"lambada_openai": {
"ppl": 411.658325603736,
"ppl_stderr": 17.894759386978997,
"acc": 0.12128856976518533,
"acc_stderr": 0.004548258586998434
},
"hendrycksTest-college_biology": {
"acc": 0.2847222222222222,
"acc_stderr": 0.037738099906869334,
"acc_norm": 0.2708333333333333,
"acc_norm_stderr": 0.037161774375660185
},
"hendrycksTest-logical_fallacies": {
"acc": 0.22699386503067484,
"acc_stderr": 0.03291099578615769,
"acc_norm": 0.26380368098159507,
"acc_norm_stderr": 0.03462419931615622
},
"hendrycksTest-abstract_algebra": {
"acc": 0.19,
"acc_stderr": 0.03942772444036623,
"acc_norm": 0.2,
"acc_norm_stderr": 0.040201512610368445
},
"hendrycksTest-anatomy": {
"acc": 0.28888888888888886,
"acc_stderr": 0.03915450630414251,
"acc_norm": 0.25925925925925924,
"acc_norm_stderr": 0.03785714465066655
},
"hendrycksTest-moral_disputes": {
"acc": 0.21098265895953758,
"acc_stderr": 0.021966309947043135,
"acc_norm": 0.3063583815028902,
"acc_norm_stderr": 0.02481835012943659
},
"hendrycksTest-jurisprudence": {
"acc": 0.23148148148148148,
"acc_stderr": 0.04077494709252626,
"acc_norm": 0.35185185185185186,
"acc_norm_stderr": 0.04616631111801713
},
"hendrycksTest-high_school_statistics": {
"acc": 0.19907407407407407,
"acc_stderr": 0.027232298462690242,
"acc_norm": 0.2361111111111111,
"acc_norm_stderr": 0.028963702570791037
},
"hendrycksTest-prehistory": {
"acc": 0.26851851851851855,
"acc_stderr": 0.024659685185967273,
"acc_norm": 0.22839506172839505,
"acc_norm_stderr": 0.023358211840626267
},
"crows_pairs_french_socioeconomic": {
"likelihood_difference": 4.585817920918367,
"likelihood_difference_stderr": 0.3843067957203524,
"pct_stereotype": 0.4489795918367347,
"pct_stereotype_stderr": 0.03561884533975954
}
},
"versions": {
"crows_pairs_english": 0,
"hendrycksTest-international_law": 0,
"hendrycksTest-sociology": 0,
"wsc": 0,
"hendrycksTest-econometrics": 0,
"hendrycksTest-electrical_engineering": 0,
"hendrycksTest-business_ethics": 0,
"hendrycksTest-virology": 0,
"hendrycksTest-nutrition": 0,
"crows_pairs_french_religion": 0,
"crows_pairs_french": 0,
"crows_pairs_french_gender": 0,
"hendrycksTest-security_studies": 0,
"hendrycksTest-college_computer_science": 0,
"crows_pairs_english_physical_appearance": 0,
"hendrycksTest-professional_medicine": 0,
"hendrycksTest-astronomy": 0,
"hendrycksTest-high_school_geography": 0,
"crows_pairs_french_race_color": 0,
"hendrycksTest-machine_learning": 0,
"crows_pairs_english_religion": 0,
"hendrycksTest-professional_accounting": 0,
"hendrycksTest-medical_genetics": 0,
"piqa": 0,
"hendrycksTest-miscellaneous": 0,
"sciq": 0,
"hendrycksTest-professional_psychology": 0,
"crows_pairs_english_nationality": 0,
"hendrycksTest-college_mathematics": 0,
"crows_pairs_english_gender": 0,
"hendrycksTest-clinical_knowledge": 0,
"crows_pairs_english_disability": 0,
"hendrycksTest-public_relations": 0,
"arc_challenge": 0,
"crows_pairs_english_age": 0,
"hendrycksTest-elementary_mathematics": 0,
"crows_pairs_french_age": 0,
"crows_pairs_french_disability": 0,
"winogrande": 0,
"hendrycksTest-high_school_computer_science": 0,
"hendrycksTest-professional_law": 0,
"hendrycksTest-college_chemistry": 0,
"crows_pairs_english_socioeconomic": 0,
"hendrycksTest-human_sexuality": 0,
"arc_easy": 0,
"hendrycksTest-college_physics": 0,
"crows_pairs_french_sexual_orientation": 0,
"hendrycksTest-high_school_government_and_politics": 0,
"hendrycksTest-human_aging": 0,
"hendrycksTest-formal_logic": 0,
"logiqa": 0,
"crows_pairs_french_physical_appearance": 0,
"hendrycksTest-high_school_microeconomics": 0,
"hendrycksTest-high_school_biology": 0,
"hendrycksTest-high_school_us_history": 0,
"hendrycksTest-high_school_world_history": 0,
"hendrycksTest-high_school_psychology": 0,
"hendrycksTest-conceptual_physics": 0,
"hendrycksTest-high_school_chemistry": 0,
"crows_pairs_french_nationality": 0,
"hendrycksTest-high_school_macroeconomics": 0,
"crows_pairs_english_sexual_orientation": 0,
"hendrycksTest-high_school_mathematics": 0,
"crows_pairs_french_autre": 0,
"hendrycksTest-college_medicine": 0,
"hendrycksTest-management": 0,
"hendrycksTest-us_foreign_policy": 0,
"hendrycksTest-world_religions": 0,
"hendrycksTest-moral_scenarios": 0,
"hendrycksTest-global_facts": 0,
"hendrycksTest-computer_security": 0,
"crows_pairs_english_race_color": 0,
"hendrycksTest-high_school_european_history": 0,
"hendrycksTest-marketing": 0,
"hendrycksTest-philosophy": 0,
"hendrycksTest-high_school_physics": 0,
"crows_pairs_english_autre": 0,
"lambada_openai": 0,
"hendrycksTest-college_biology": 0,
"hendrycksTest-logical_fallacies": 0,
"hendrycksTest-abstract_algebra": 0,
"hendrycksTest-anatomy": 0,
"hendrycksTest-moral_disputes": 0,
"hendrycksTest-jurisprudence": 0,
"hendrycksTest-high_school_statistics": 0,
"hendrycksTest-prehistory": 0,
"crows_pairs_french_socioeconomic": 0
},
"config": {
"model": "hf-causal",
"model_args": "pretrained=EleutherAI/pythia-v1.1-70m,revision=step3000",
"num_fewshot": 0,
"batch_size": 16,
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}