karlexmarin's picture
feat: ship paper artefacts + CLI diagnostic alongside browser tool
535348a
raw
history blame
22 kB
{
"results": {
"crows_pairs_french": {
"likelihood_difference": 6.341835308586762,
"likelihood_difference_stderr": 0.15048967426229426,
"pct_stereotype": 0.49314251639833034,
"pct_stereotype_stderr": 0.012212150501851282
},
"hendrycksTest-virology": {
"acc": 0.2289156626506024,
"acc_stderr": 0.03270745277352477,
"acc_norm": 0.24096385542168675,
"acc_norm_stderr": 0.03329394119073528
},
"hendrycksTest-econometrics": {
"acc": 0.2631578947368421,
"acc_stderr": 0.0414243971948936,
"acc_norm": 0.2982456140350877,
"acc_norm_stderr": 0.04303684033537315
},
"hendrycksTest-high_school_psychology": {
"acc": 0.23669724770642203,
"acc_stderr": 0.01822407811729908,
"acc_norm": 0.25137614678899084,
"acc_norm_stderr": 0.018599206360287415
},
"crows_pairs_english_socioeconomic": {
"likelihood_difference": 4.267434210526316,
"likelihood_difference_stderr": 0.33123322148424716,
"pct_stereotype": 0.5210526315789473,
"pct_stereotype_stderr": 0.03633739504773335
},
"hendrycksTest-security_studies": {
"acc": 0.3224489795918367,
"acc_stderr": 0.029923100563683913,
"acc_norm": 0.2,
"acc_norm_stderr": 0.025607375986579157
},
"hendrycksTest-high_school_macroeconomics": {
"acc": 0.2128205128205128,
"acc_stderr": 0.020752423722128006,
"acc_norm": 0.24615384615384617,
"acc_norm_stderr": 0.021840866990423088
},
"crows_pairs_english_religion": {
"likelihood_difference": 3.94481981981982,
"likelihood_difference_stderr": 0.43861537266600115,
"pct_stereotype": 0.5135135135135135,
"pct_stereotype_stderr": 0.04765571461988585
},
"hendrycksTest-high_school_geography": {
"acc": 0.23737373737373738,
"acc_stderr": 0.030313710538198892,
"acc_norm": 0.26262626262626265,
"acc_norm_stderr": 0.03135305009533087
},
"crows_pairs_english": {
"likelihood_difference": 4.10218209600477,
"likelihood_difference_stderr": 0.1236077959409224,
"pct_stereotype": 0.48598688133571855,
"pct_stereotype_stderr": 0.012208501686447064
},
"logiqa": {
"acc": 0.19201228878648233,
"acc_stderr": 0.01544934998590095,
"acc_norm": 0.22427035330261136,
"acc_norm_stderr": 0.016360043348265515
},
"hendrycksTest-professional_medicine": {
"acc": 0.22794117647058823,
"acc_stderr": 0.025483081468029804,
"acc_norm": 0.22426470588235295,
"acc_norm_stderr": 0.025336848563332348
},
"hendrycksTest-moral_disputes": {
"acc": 0.22254335260115607,
"acc_stderr": 0.02239421566194282,
"acc_norm": 0.17630057803468208,
"acc_norm_stderr": 0.02051642567249071
},
"crows_pairs_french_age": {
"likelihood_difference": 3.9694444444444446,
"likelihood_difference_stderr": 0.42986839067167343,
"pct_stereotype": 0.4444444444444444,
"pct_stereotype_stderr": 0.052671718126664185
},
"crows_pairs_french_gender": {
"likelihood_difference": 4.424454828660436,
"likelihood_difference_stderr": 0.22752081870537358,
"pct_stereotype": 0.5389408099688473,
"pct_stereotype_stderr": 0.027865952192986033
},
"hendrycksTest-college_biology": {
"acc": 0.2222222222222222,
"acc_stderr": 0.034765901043041336,
"acc_norm": 0.24305555555555555,
"acc_norm_stderr": 0.03586879280080342
},
"hendrycksTest-high_school_physics": {
"acc": 0.2185430463576159,
"acc_stderr": 0.03374235550425694,
"acc_norm": 0.25165562913907286,
"acc_norm_stderr": 0.035433042343899844
},
"crows_pairs_french_physical_appearance": {
"likelihood_difference": 6.619357638888889,
"likelihood_difference_stderr": 0.8210291220143636,
"pct_stereotype": 0.5555555555555556,
"pct_stereotype_stderr": 0.05897165471491952
},
"crows_pairs_french_disability": {
"likelihood_difference": 8.846590909090908,
"likelihood_difference_stderr": 0.9260560084979663,
"pct_stereotype": 0.4090909090909091,
"pct_stereotype_stderr": 0.060983672113630656
},
"hendrycksTest-global_facts": {
"acc": 0.26,
"acc_stderr": 0.04408440022768079,
"acc_norm": 0.27,
"acc_norm_stderr": 0.044619604333847394
},
"crows_pairs_french_socioeconomic": {
"likelihood_difference": 6.335817920918367,
"likelihood_difference_stderr": 0.49605368420562174,
"pct_stereotype": 0.3877551020408163,
"pct_stereotype_stderr": 0.03489185364347385
},
"hendrycksTest-jurisprudence": {
"acc": 0.18518518518518517,
"acc_stderr": 0.03755265865037181,
"acc_norm": 0.18518518518518517,
"acc_norm_stderr": 0.03755265865037182
},
"hendrycksTest-machine_learning": {
"acc": 0.24107142857142858,
"acc_stderr": 0.04059867246952686,
"acc_norm": 0.2767857142857143,
"acc_norm_stderr": 0.042466243366976256
},
"hendrycksTest-high_school_mathematics": {
"acc": 0.14074074074074075,
"acc_stderr": 0.0212029303435688,
"acc_norm": 0.2,
"acc_norm_stderr": 0.024388430433987664
},
"hendrycksTest-human_sexuality": {
"acc": 0.2900763358778626,
"acc_stderr": 0.03980066246467765,
"acc_norm": 0.31297709923664124,
"acc_norm_stderr": 0.04066962905677698
},
"hendrycksTest-high_school_computer_science": {
"acc": 0.23,
"acc_stderr": 0.04229525846816505,
"acc_norm": 0.23,
"acc_norm_stderr": 0.042295258468165044
},
"hendrycksTest-high_school_government_and_politics": {
"acc": 0.18652849740932642,
"acc_stderr": 0.02811209121011746,
"acc_norm": 0.2538860103626943,
"acc_norm_stderr": 0.03141024780565319
},
"hendrycksTest-professional_law": {
"acc": 0.2379400260756193,
"acc_stderr": 0.010875700787694231,
"acc_norm": 0.26988265971316816,
"acc_norm_stderr": 0.011337381084250423
},
"hendrycksTest-clinical_knowledge": {
"acc": 0.2,
"acc_stderr": 0.024618298195866518,
"acc_norm": 0.2943396226415094,
"acc_norm_stderr": 0.028049186315695245
},
"hendrycksTest-high_school_chemistry": {
"acc": 0.20689655172413793,
"acc_stderr": 0.02850137816789395,
"acc_norm": 0.2512315270935961,
"acc_norm_stderr": 0.030516530732694436
},
"hendrycksTest-management": {
"acc": 0.22330097087378642,
"acc_stderr": 0.04123553189891431,
"acc_norm": 0.24271844660194175,
"acc_norm_stderr": 0.04245022486384495
},
"hendrycksTest-astronomy": {
"acc": 0.21710526315789475,
"acc_stderr": 0.03355045304882924,
"acc_norm": 0.2894736842105263,
"acc_norm_stderr": 0.03690677986137282
},
"hendrycksTest-high_school_european_history": {
"acc": 0.1393939393939394,
"acc_stderr": 0.0270459488258654,
"acc_norm": 0.23636363636363636,
"acc_norm_stderr": 0.03317505930009182
},
"hendrycksTest-college_medicine": {
"acc": 0.24277456647398843,
"acc_stderr": 0.0326926380614177,
"acc_norm": 0.2254335260115607,
"acc_norm_stderr": 0.03186209851641143
},
"hendrycksTest-college_physics": {
"acc": 0.16666666666666666,
"acc_stderr": 0.03708284662416542,
"acc_norm": 0.24509803921568626,
"acc_norm_stderr": 0.042801058373643966
},
"hendrycksTest-high_school_microeconomics": {
"acc": 0.21008403361344538,
"acc_stderr": 0.026461398717471874,
"acc_norm": 0.27310924369747897,
"acc_norm_stderr": 0.02894200404099817
},
"crows_pairs_english_physical_appearance": {
"likelihood_difference": 3.7877604166666665,
"likelihood_difference_stderr": 0.4734800383795231,
"pct_stereotype": 0.5138888888888888,
"pct_stereotype_stderr": 0.05931618532716555
},
"hendrycksTest-abstract_algebra": {
"acc": 0.19,
"acc_stderr": 0.03942772444036625,
"acc_norm": 0.2,
"acc_norm_stderr": 0.04020151261036843
},
"hendrycksTest-elementary_mathematics": {
"acc": 0.2275132275132275,
"acc_stderr": 0.021591269407823774,
"acc_norm": 0.2222222222222222,
"acc_norm_stderr": 0.021411684393694185
},
"hendrycksTest-human_aging": {
"acc": 0.3452914798206278,
"acc_stderr": 0.03191100192835794,
"acc_norm": 0.2825112107623318,
"acc_norm_stderr": 0.030216831011508773
},
"winogrande": {
"acc": 0.5098658247829518,
"acc_stderr": 0.014049749833367592
},
"crows_pairs_english_disability": {
"likelihood_difference": 6.442788461538462,
"likelihood_difference_stderr": 0.7741982131043712,
"pct_stereotype": 0.5230769230769231,
"pct_stereotype_stderr": 0.06243339646441512
},
"crows_pairs_french_sexual_orientation": {
"likelihood_difference": 14.371565934065934,
"likelihood_difference_stderr": 1.0132795779676502,
"pct_stereotype": 0.8021978021978022,
"pct_stereotype_stderr": 0.04198895203196222
},
"hendrycksTest-high_school_biology": {
"acc": 0.22258064516129034,
"acc_stderr": 0.023664216671642514,
"acc_norm": 0.24516129032258063,
"acc_norm_stderr": 0.024472243840895525
},
"arc_challenge": {
"acc": 0.18003412969283278,
"acc_stderr": 0.011227856729050028,
"acc_norm": 0.2175767918088737,
"acc_norm_stderr": 0.012057262020972499
},
"hendrycksTest-us_foreign_policy": {
"acc": 0.2,
"acc_stderr": 0.04020151261036845,
"acc_norm": 0.24,
"acc_norm_stderr": 0.04292346959909283
},
"hendrycksTest-college_computer_science": {
"acc": 0.24,
"acc_stderr": 0.04292346959909282,
"acc_norm": 0.19,
"acc_norm_stderr": 0.03942772444036624
},
"lambada_openai": {
"ppl": 116756.33428953367,
"ppl_stderr": 6456.789280142739,
"acc": 0.0,
"acc_stderr": 0.0
},
"crows_pairs_english_sexual_orientation": {
"likelihood_difference": 4.444220430107527,
"likelihood_difference_stderr": 0.6472437756111237,
"pct_stereotype": 0.7096774193548387,
"pct_stereotype_stderr": 0.04732351421824121
},
"hendrycksTest-high_school_statistics": {
"acc": 0.25462962962962965,
"acc_stderr": 0.02971127586000536,
"acc_norm": 0.24074074074074073,
"acc_norm_stderr": 0.029157522184605596
},
"hendrycksTest-computer_security": {
"acc": 0.27,
"acc_stderr": 0.04461960433384739,
"acc_norm": 0.3,
"acc_norm_stderr": 0.046056618647183814
},
"crows_pairs_french_race_color": {
"likelihood_difference": 4.9554347826086955,
"likelihood_difference_stderr": 0.22275405195298537,
"pct_stereotype": 0.5260869565217391,
"pct_stereotype_stderr": 0.0233062153668594
},
"crows_pairs_english_age": {
"likelihood_difference": 3.65625,
"likelihood_difference_stderr": 0.4765308636339587,
"pct_stereotype": 0.38461538461538464,
"pct_stereotype_stderr": 0.051282051282051246
},
"hendrycksTest-philosophy": {
"acc": 0.22186495176848875,
"acc_stderr": 0.02359885829286305,
"acc_norm": 0.2797427652733119,
"acc_norm_stderr": 0.0254942593506949
},
"hendrycksTest-electrical_engineering": {
"acc": 0.2827586206896552,
"acc_stderr": 0.037528339580033376,
"acc_norm": 0.2689655172413793,
"acc_norm_stderr": 0.036951833116502325
},
"hendrycksTest-sociology": {
"acc": 0.21393034825870647,
"acc_stderr": 0.028996909693328927,
"acc_norm": 0.21393034825870647,
"acc_norm_stderr": 0.02899690969332891
},
"hendrycksTest-professional_accounting": {
"acc": 0.2695035460992908,
"acc_stderr": 0.026469036818590624,
"acc_norm": 0.2730496453900709,
"acc_norm_stderr": 0.02657786094330786
},
"hendrycksTest-miscellaneous": {
"acc": 0.24393358876117496,
"acc_stderr": 0.01535721266582948,
"acc_norm": 0.25287356321839083,
"acc_norm_stderr": 0.015543377313719681
},
"sciq": {
"acc": 0.264,
"acc_stderr": 0.01394627184944047,
"acc_norm": 0.275,
"acc_norm_stderr": 0.014127086556490528
},
"crows_pairs_french_autre": {
"likelihood_difference": 5.8173076923076925,
"likelihood_difference_stderr": 0.8524880376227814,
"pct_stereotype": 0.3076923076923077,
"pct_stereotype_stderr": 0.13323467750529824
},
"hendrycksTest-public_relations": {
"acc": 0.3181818181818182,
"acc_stderr": 0.044612721759105085,
"acc_norm": 0.15454545454545454,
"acc_norm_stderr": 0.03462262571262667
},
"crows_pairs_french_religion": {
"likelihood_difference": 7.920380434782609,
"likelihood_difference_stderr": 0.5131357048721925,
"pct_stereotype": 0.5652173913043478,
"pct_stereotype_stderr": 0.046429222863564275
},
"hendrycksTest-international_law": {
"acc": 0.12396694214876033,
"acc_stderr": 0.030083098716035227,
"acc_norm": 0.2727272727272727,
"acc_norm_stderr": 0.04065578140908705
},
"hendrycksTest-anatomy": {
"acc": 0.22962962962962963,
"acc_stderr": 0.036333844140734664,
"acc_norm": 0.2518518518518518,
"acc_norm_stderr": 0.03749850709174023
},
"hendrycksTest-conceptual_physics": {
"acc": 0.2765957446808511,
"acc_stderr": 0.029241883869628827,
"acc_norm": 0.20425531914893616,
"acc_norm_stderr": 0.026355158413349424
},
"wsc": {
"acc": 0.5192307692307693,
"acc_stderr": 0.049230010729780505
},
"hendrycksTest-moral_scenarios": {
"acc": 0.23798882681564246,
"acc_stderr": 0.014242630070574915,
"acc_norm": 0.27262569832402234,
"acc_norm_stderr": 0.014893391735249588
},
"hendrycksTest-medical_genetics": {
"acc": 0.23,
"acc_stderr": 0.042295258468165044,
"acc_norm": 0.22,
"acc_norm_stderr": 0.04163331998932269
},
"piqa": {
"acc": 0.5386289445048966,
"acc_stderr": 0.011630956681145914,
"acc_norm": 0.5244831338411317,
"acc_norm_stderr": 0.011651830225709979
},
"hendrycksTest-college_mathematics": {
"acc": 0.16,
"acc_stderr": 0.03684529491774709,
"acc_norm": 0.19,
"acc_norm_stderr": 0.03942772444036625
},
"hendrycksTest-prehistory": {
"acc": 0.27469135802469136,
"acc_stderr": 0.024836057868294688,
"acc_norm": 0.2191358024691358,
"acc_norm_stderr": 0.023016705640262196
},
"hendrycksTest-professional_psychology": {
"acc": 0.2369281045751634,
"acc_stderr": 0.017201662169789782,
"acc_norm": 0.2973856209150327,
"acc_norm_stderr": 0.01849259653639695
},
"hendrycksTest-logical_fallacies": {
"acc": 0.2147239263803681,
"acc_stderr": 0.03226219377286774,
"acc_norm": 0.3128834355828221,
"acc_norm_stderr": 0.036429145782924055
},
"crows_pairs_english_nationality": {
"likelihood_difference": 4.462456597222222,
"likelihood_difference_stderr": 0.33400887699163057,
"pct_stereotype": 0.33796296296296297,
"pct_stereotype_stderr": 0.03225941352631295
},
"hendrycksTest-formal_logic": {
"acc": 0.23015873015873015,
"acc_stderr": 0.03764950879790607,
"acc_norm": 0.25396825396825395,
"acc_norm_stderr": 0.038932596106046755
},
"crows_pairs_english_gender": {
"likelihood_difference": 3.560498046875,
"likelihood_difference_stderr": 0.32984136074752396,
"pct_stereotype": 0.5375,
"pct_stereotype_stderr": 0.02791577963000664
},
"hendrycksTest-high_school_world_history": {
"acc": 0.189873417721519,
"acc_stderr": 0.025530100460233497,
"acc_norm": 0.23628691983122363,
"acc_norm_stderr": 0.02765215314415925
},
"hendrycksTest-college_chemistry": {
"acc": 0.24,
"acc_stderr": 0.042923469599092816,
"acc_norm": 0.21,
"acc_norm_stderr": 0.040936018074033256
},
"hendrycksTest-business_ethics": {
"acc": 0.23,
"acc_stderr": 0.04229525846816506,
"acc_norm": 0.29,
"acc_norm_stderr": 0.04560480215720684
},
"hendrycksTest-world_religions": {
"acc": 0.14035087719298245,
"acc_stderr": 0.0266405825391332,
"acc_norm": 0.21052631578947367,
"acc_norm_stderr": 0.03126781714663179
},
"crows_pairs_english_autre": {
"likelihood_difference": 6.849431818181818,
"likelihood_difference_stderr": 2.586994276246196,
"pct_stereotype": 0.36363636363636365,
"pct_stereotype_stderr": 0.15212000482437738
},
"hendrycksTest-high_school_us_history": {
"acc": 0.20588235294117646,
"acc_stderr": 0.028379449451588667,
"acc_norm": 0.23529411764705882,
"acc_norm_stderr": 0.02977177522814565
},
"hendrycksTest-nutrition": {
"acc": 0.17973856209150327,
"acc_stderr": 0.021986032182064148,
"acc_norm": 0.27450980392156865,
"acc_norm_stderr": 0.025553169991826517
},
"crows_pairs_french_nationality": {
"likelihood_difference": 7.832756916996048,
"likelihood_difference_stderr": 0.35785767445511346,
"pct_stereotype": 0.3438735177865613,
"pct_stereotype_stderr": 0.029922155720849428
},
"arc_easy": {
"acc": 0.2984006734006734,
"acc_stderr": 0.009388855914040428,
"acc_norm": 0.30134680134680136,
"acc_norm_stderr": 0.0094152598793516
},
"hendrycksTest-marketing": {
"acc": 0.24358974358974358,
"acc_stderr": 0.0281209665039144,
"acc_norm": 0.25213675213675213,
"acc_norm_stderr": 0.02844796547623101
},
"crows_pairs_english_race_color": {
"likelihood_difference": 3.9656434547244093,
"likelihood_difference_stderr": 0.19638996276808376,
"pct_stereotype": 0.468503937007874,
"pct_stereotype_stderr": 0.022161679438492773
}
},
"versions": {
"crows_pairs_french": 0,
"hendrycksTest-virology": 0,
"hendrycksTest-econometrics": 0,
"hendrycksTest-high_school_psychology": 0,
"crows_pairs_english_socioeconomic": 0,
"hendrycksTest-security_studies": 0,
"hendrycksTest-high_school_macroeconomics": 0,
"crows_pairs_english_religion": 0,
"hendrycksTest-high_school_geography": 0,
"crows_pairs_english": 0,
"logiqa": 0,
"hendrycksTest-professional_medicine": 0,
"hendrycksTest-moral_disputes": 0,
"crows_pairs_french_age": 0,
"crows_pairs_french_gender": 0,
"hendrycksTest-college_biology": 0,
"hendrycksTest-high_school_physics": 0,
"crows_pairs_french_physical_appearance": 0,
"crows_pairs_french_disability": 0,
"hendrycksTest-global_facts": 0,
"crows_pairs_french_socioeconomic": 0,
"hendrycksTest-jurisprudence": 0,
"hendrycksTest-machine_learning": 0,
"hendrycksTest-high_school_mathematics": 0,
"hendrycksTest-human_sexuality": 0,
"hendrycksTest-high_school_computer_science": 0,
"hendrycksTest-high_school_government_and_politics": 0,
"hendrycksTest-professional_law": 0,
"hendrycksTest-clinical_knowledge": 0,
"hendrycksTest-high_school_chemistry": 0,
"hendrycksTest-management": 0,
"hendrycksTest-astronomy": 0,
"hendrycksTest-high_school_european_history": 0,
"hendrycksTest-college_medicine": 0,
"hendrycksTest-college_physics": 0,
"hendrycksTest-high_school_microeconomics": 0,
"crows_pairs_english_physical_appearance": 0,
"hendrycksTest-abstract_algebra": 0,
"hendrycksTest-elementary_mathematics": 0,
"hendrycksTest-human_aging": 0,
"winogrande": 0,
"crows_pairs_english_disability": 0,
"crows_pairs_french_sexual_orientation": 0,
"hendrycksTest-high_school_biology": 0,
"arc_challenge": 0,
"hendrycksTest-us_foreign_policy": 0,
"hendrycksTest-college_computer_science": 0,
"lambada_openai": 0,
"crows_pairs_english_sexual_orientation": 0,
"hendrycksTest-high_school_statistics": 0,
"hendrycksTest-computer_security": 0,
"crows_pairs_french_race_color": 0,
"crows_pairs_english_age": 0,
"hendrycksTest-philosophy": 0,
"hendrycksTest-electrical_engineering": 0,
"hendrycksTest-sociology": 0,
"hendrycksTest-professional_accounting": 0,
"hendrycksTest-miscellaneous": 0,
"sciq": 0,
"crows_pairs_french_autre": 0,
"hendrycksTest-public_relations": 0,
"crows_pairs_french_religion": 0,
"hendrycksTest-international_law": 0,
"hendrycksTest-anatomy": 0,
"hendrycksTest-conceptual_physics": 0,
"wsc": 0,
"hendrycksTest-moral_scenarios": 0,
"hendrycksTest-medical_genetics": 0,
"piqa": 0,
"hendrycksTest-college_mathematics": 0,
"hendrycksTest-prehistory": 0,
"hendrycksTest-professional_psychology": 0,
"hendrycksTest-logical_fallacies": 0,
"crows_pairs_english_nationality": 0,
"hendrycksTest-formal_logic": 0,
"crows_pairs_english_gender": 0,
"hendrycksTest-high_school_world_history": 0,
"hendrycksTest-college_chemistry": 0,
"hendrycksTest-business_ethics": 0,
"hendrycksTest-world_religions": 0,
"crows_pairs_english_autre": 0,
"hendrycksTest-high_school_us_history": 0,
"hendrycksTest-nutrition": 0,
"crows_pairs_french_nationality": 0,
"arc_easy": 0,
"hendrycksTest-marketing": 0,
"crows_pairs_english_race_color": 0
},
"config": {
"model": "hf-causal",
"model_args": "use_accelerate=True,pretrained=EleutherAI/pythia-v1.1-70m,revision=step512",
"num_fewshot": 0,
"batch_size": 32,
"device": null,
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}