karlexmarin's picture
feat: ship paper artefacts + CLI diagnostic alongside browser tool
535348a
raw
history blame
22 kB
{
"results": {
"arc_easy": {
"acc": 0.26641414141414144,
"acc_stderr": 0.009071357971078687,
"acc_norm": 0.2537878787878788,
"acc_norm_stderr": 0.008929657065808295
},
"crows_pairs_french_disability": {
"likelihood_difference": 14.844696969696969,
"likelihood_difference_stderr": 1.4724037604097784,
"pct_stereotype": 0.36363636363636365,
"pct_stereotype_stderr": 0.05966637484671757
},
"hendrycksTest-high_school_microeconomics": {
"acc": 0.19327731092436976,
"acc_stderr": 0.025649470265889193,
"acc_norm": 0.2815126050420168,
"acc_norm_stderr": 0.029213549414372163
},
"wsc": {
"acc": 0.6153846153846154,
"acc_stderr": 0.0479366886807504
},
"hendrycksTest-high_school_physics": {
"acc": 0.1986754966887417,
"acc_stderr": 0.03257847384436777,
"acc_norm": 0.31788079470198677,
"acc_norm_stderr": 0.038020397601079024
},
"hendrycksTest-jurisprudence": {
"acc": 0.1574074074074074,
"acc_stderr": 0.03520703990517965,
"acc_norm": 0.25,
"acc_norm_stderr": 0.04186091791394607
},
"hendrycksTest-world_religions": {
"acc": 0.2046783625730994,
"acc_stderr": 0.030944459778533193,
"acc_norm": 0.24561403508771928,
"acc_norm_stderr": 0.03301405946987249
},
"crows_pairs_french": {
"likelihood_difference": 10.034911672629695,
"likelihood_difference_stderr": 0.23396856909749553,
"pct_stereotype": 0.5766249254621347,
"pct_stereotype_stderr": 0.012069029300507982
},
"crows_pairs_french_race_color": {
"likelihood_difference": 9.03695652173913,
"likelihood_difference_stderr": 0.37564611061761194,
"pct_stereotype": 0.6543478260869565,
"pct_stereotype_stderr": 0.02219819363895969
},
"hendrycksTest-high_school_mathematics": {
"acc": 0.22592592592592592,
"acc_stderr": 0.02549753263960955,
"acc_norm": 0.26296296296296295,
"acc_norm_stderr": 0.02684205787383371
},
"crows_pairs_french_socioeconomic": {
"likelihood_difference": 13.044244260204081,
"likelihood_difference_stderr": 0.8616384783016274,
"pct_stereotype": 0.413265306122449,
"pct_stereotype_stderr": 0.035262902194360866
},
"crows_pairs_english_religion": {
"likelihood_difference": 5.86936936936937,
"likelihood_difference_stderr": 0.712342266308724,
"pct_stereotype": 0.5225225225225225,
"pct_stereotype_stderr": 0.04762473917649626
},
"hendrycksTest-college_computer_science": {
"acc": 0.25,
"acc_stderr": 0.04351941398892446,
"acc_norm": 0.19,
"acc_norm_stderr": 0.03942772444036624
},
"hendrycksTest-professional_accounting": {
"acc": 0.2730496453900709,
"acc_stderr": 0.026577860943307857,
"acc_norm": 0.2553191489361702,
"acc_norm_stderr": 0.026011992930902006
},
"hendrycksTest-electrical_engineering": {
"acc": 0.2,
"acc_stderr": 0.0333333333333333,
"acc_norm": 0.18620689655172415,
"acc_norm_stderr": 0.03243946159004616
},
"crows_pairs_english_physical_appearance": {
"likelihood_difference": 6.25390625,
"likelihood_difference_stderr": 0.8484057131433209,
"pct_stereotype": 0.5277777777777778,
"pct_stereotype_stderr": 0.05924743948371487
},
"crows_pairs_french_autre": {
"likelihood_difference": 7.038461538461538,
"likelihood_difference_stderr": 1.9443187814541079,
"pct_stereotype": 0.38461538461538464,
"pct_stereotype_stderr": 0.1404416814115811
},
"hendrycksTest-high_school_world_history": {
"acc": 0.17721518987341772,
"acc_stderr": 0.024856364184503217,
"acc_norm": 0.2109704641350211,
"acc_norm_stderr": 0.02655837250266192
},
"hendrycksTest-college_chemistry": {
"acc": 0.17,
"acc_stderr": 0.0377525168068637,
"acc_norm": 0.19,
"acc_norm_stderr": 0.03942772444036623
},
"lambada_openai": {
"ppl": 3646061.8456566,
"ppl_stderr": 355236.72805614787,
"acc": 0.0,
"acc_stderr": 0.0
},
"crows_pairs_french_age": {
"likelihood_difference": 5.894444444444445,
"likelihood_difference_stderr": 0.7650007372090334,
"pct_stereotype": 0.6555555555555556,
"pct_stereotype_stderr": 0.050369697187736755
},
"crows_pairs_english_race_color": {
"likelihood_difference": 5.904281496062992,
"likelihood_difference_stderr": 0.36247868571395325,
"pct_stereotype": 0.38188976377952755,
"pct_stereotype_stderr": 0.021577344577442634
},
"hendrycksTest-high_school_european_history": {
"acc": 0.17575757575757575,
"acc_stderr": 0.02972094300622445,
"acc_norm": 0.18181818181818182,
"acc_norm_stderr": 0.030117688929503585
},
"piqa": {
"acc": 0.5239390642002176,
"acc_stderr": 0.011652445621079262,
"acc_norm": 0.5195865070729053,
"acc_norm_stderr": 0.011656869979288458
},
"hendrycksTest-anatomy": {
"acc": 0.22962962962962963,
"acc_stderr": 0.03633384414073465,
"acc_norm": 0.2962962962962963,
"acc_norm_stderr": 0.03944624162501117
},
"crows_pairs_english": {
"likelihood_difference": 5.7787343470483,
"likelihood_difference_stderr": 0.1995147197033885,
"pct_stereotype": 0.4639236732259988,
"pct_stereotype_stderr": 0.012181466483312616
},
"hendrycksTest-clinical_knowledge": {
"acc": 0.19622641509433963,
"acc_stderr": 0.02444238813110085,
"acc_norm": 0.2943396226415094,
"acc_norm_stderr": 0.02804918631569524
},
"hendrycksTest-high_school_geography": {
"acc": 0.23232323232323232,
"acc_stderr": 0.03008862949021749,
"acc_norm": 0.2777777777777778,
"acc_norm_stderr": 0.03191178226713547
},
"crows_pairs_english_nationality": {
"likelihood_difference": 6.42115162037037,
"likelihood_difference_stderr": 0.536959840207466,
"pct_stereotype": 0.44907407407407407,
"pct_stereotype_stderr": 0.03392238405321617
},
"hendrycksTest-security_studies": {
"acc": 0.3183673469387755,
"acc_stderr": 0.02982253379398205,
"acc_norm": 0.19591836734693877,
"acc_norm_stderr": 0.025409301953225678
},
"hendrycksTest-professional_medicine": {
"acc": 0.2426470588235294,
"acc_stderr": 0.026040662474201264,
"acc_norm": 0.2757352941176471,
"acc_norm_stderr": 0.027146271936625162
},
"hendrycksTest-human_sexuality": {
"acc": 0.2595419847328244,
"acc_stderr": 0.03844876139785271,
"acc_norm": 0.2824427480916031,
"acc_norm_stderr": 0.03948406125768362
},
"hendrycksTest-conceptual_physics": {
"acc": 0.2680851063829787,
"acc_stderr": 0.028957342788342347,
"acc_norm": 0.251063829787234,
"acc_norm_stderr": 0.02834696377716245
},
"hendrycksTest-philosophy": {
"acc": 0.22508038585209003,
"acc_stderr": 0.023720088516179027,
"acc_norm": 0.29260450160771706,
"acc_norm_stderr": 0.02583989833487798
},
"hendrycksTest-public_relations": {
"acc": 0.2545454545454545,
"acc_stderr": 0.041723430387053825,
"acc_norm": 0.18181818181818182,
"acc_norm_stderr": 0.03694284335337798
},
"hendrycksTest-sociology": {
"acc": 0.26865671641791045,
"acc_stderr": 0.031343283582089536,
"acc_norm": 0.2736318407960199,
"acc_norm_stderr": 0.03152439186555402
},
"crows_pairs_english_disability": {
"likelihood_difference": 7.9192307692307695,
"likelihood_difference_stderr": 1.2408777134628783,
"pct_stereotype": 0.5384615384615384,
"pct_stereotype_stderr": 0.06231481440776789
},
"hendrycksTest-human_aging": {
"acc": 0.28699551569506726,
"acc_stderr": 0.030360379710291936,
"acc_norm": 0.26905829596412556,
"acc_norm_stderr": 0.029763779406874975
},
"hendrycksTest-virology": {
"acc": 0.19879518072289157,
"acc_stderr": 0.03106939026078942,
"acc_norm": 0.21686746987951808,
"acc_norm_stderr": 0.03208284450356365
},
"hendrycksTest-college_physics": {
"acc": 0.23529411764705882,
"acc_stderr": 0.04220773659171451,
"acc_norm": 0.22549019607843138,
"acc_norm_stderr": 0.04158307533083286
},
"hendrycksTest-prehistory": {
"acc": 0.24074074074074073,
"acc_stderr": 0.023788583551658544,
"acc_norm": 0.21604938271604937,
"acc_norm_stderr": 0.022899162918445792
},
"hendrycksTest-abstract_algebra": {
"acc": 0.23,
"acc_stderr": 0.04229525846816507,
"acc_norm": 0.22,
"acc_norm_stderr": 0.041633319989322695
},
"hendrycksTest-astronomy": {
"acc": 0.17105263157894737,
"acc_stderr": 0.030643607071677098,
"acc_norm": 0.23026315789473684,
"acc_norm_stderr": 0.03426059424403165
},
"winogrande": {
"acc": 0.489344909234412,
"acc_stderr": 0.0140492945362904
},
"hendrycksTest-high_school_statistics": {
"acc": 0.24074074074074073,
"acc_stderr": 0.029157522184605593,
"acc_norm": 0.28703703703703703,
"acc_norm_stderr": 0.030851992993257013
},
"hendrycksTest-moral_disputes": {
"acc": 0.2254335260115607,
"acc_stderr": 0.02249723019096755,
"acc_norm": 0.2138728323699422,
"acc_norm_stderr": 0.022075709251757173
},
"hendrycksTest-econometrics": {
"acc": 0.24561403508771928,
"acc_stderr": 0.04049339297748139,
"acc_norm": 0.3157894736842105,
"acc_norm_stderr": 0.04372748290278007
},
"crows_pairs_french_gender": {
"likelihood_difference": 7.683216510903427,
"likelihood_difference_stderr": 0.45391062504811047,
"pct_stereotype": 0.5327102803738317,
"pct_stereotype_stderr": 0.027890972865217984
},
"hendrycksTest-high_school_macroeconomics": {
"acc": 0.2128205128205128,
"acc_stderr": 0.020752423722128016,
"acc_norm": 0.24358974358974358,
"acc_norm_stderr": 0.02176373368417392
},
"hendrycksTest-high_school_us_history": {
"acc": 0.18627450980392157,
"acc_stderr": 0.027325470966716326,
"acc_norm": 0.2647058823529412,
"acc_norm_stderr": 0.030964517926923393
},
"crows_pairs_english_autre": {
"likelihood_difference": 6.198863636363637,
"likelihood_difference_stderr": 3.164913494502804,
"pct_stereotype": 0.45454545454545453,
"pct_stereotype_stderr": 0.15745916432444335
},
"hendrycksTest-nutrition": {
"acc": 0.17973856209150327,
"acc_stderr": 0.021986032182064148,
"acc_norm": 0.24509803921568626,
"acc_norm_stderr": 0.024630048979824782
},
"hendrycksTest-logical_fallacies": {
"acc": 0.1656441717791411,
"acc_stderr": 0.029208296231259104,
"acc_norm": 0.2331288343558282,
"acc_norm_stderr": 0.03322015795776741
},
"hendrycksTest-marketing": {
"acc": 0.23076923076923078,
"acc_stderr": 0.027601921381417597,
"acc_norm": 0.2564102564102564,
"acc_norm_stderr": 0.028605953702004243
},
"logiqa": {
"acc": 0.23195084485407066,
"acc_stderr": 0.016555252497925898,
"acc_norm": 0.2488479262672811,
"acc_norm_stderr": 0.016957985904525585
},
"hendrycksTest-elementary_mathematics": {
"acc": 0.23015873015873015,
"acc_stderr": 0.02167921966369314,
"acc_norm": 0.23809523809523808,
"acc_norm_stderr": 0.021935878081184763
},
"hendrycksTest-international_law": {
"acc": 0.09090909090909091,
"acc_stderr": 0.02624319405407387,
"acc_norm": 0.2231404958677686,
"acc_norm_stderr": 0.03800754475228733
},
"crows_pairs_french_sexual_orientation": {
"likelihood_difference": 14.885302197802197,
"likelihood_difference_stderr": 1.0262045022589799,
"pct_stereotype": 0.7912087912087912,
"pct_stereotype_stderr": 0.04284305206509431
},
"hendrycksTest-miscellaneous": {
"acc": 0.26436781609195403,
"acc_stderr": 0.015769984840690518,
"acc_norm": 0.2656449553001277,
"acc_norm_stderr": 0.01579430248788872
},
"hendrycksTest-us_foreign_policy": {
"acc": 0.21,
"acc_stderr": 0.040936018074033256,
"acc_norm": 0.25,
"acc_norm_stderr": 0.04351941398892446
},
"hendrycksTest-college_biology": {
"acc": 0.2638888888888889,
"acc_stderr": 0.03685651095897532,
"acc_norm": 0.2708333333333333,
"acc_norm_stderr": 0.03716177437566017
},
"hendrycksTest-college_mathematics": {
"acc": 0.21,
"acc_stderr": 0.040936018074033256,
"acc_norm": 0.17,
"acc_norm_stderr": 0.03775251680686371
},
"crows_pairs_english_age": {
"likelihood_difference": 3.793956043956044,
"likelihood_difference_stderr": 0.6669967521493633,
"pct_stereotype": 0.6373626373626373,
"pct_stereotype_stderr": 0.05067669921031868
},
"hendrycksTest-high_school_government_and_politics": {
"acc": 0.22279792746113988,
"acc_stderr": 0.03003114797764154,
"acc_norm": 0.24870466321243523,
"acc_norm_stderr": 0.03119584087770029
},
"crows_pairs_french_physical_appearance": {
"likelihood_difference": 10.175347222222221,
"likelihood_difference_stderr": 1.3645954505299696,
"pct_stereotype": 0.5,
"pct_stereotype_stderr": 0.05933908290969268
},
"hendrycksTest-moral_scenarios": {
"acc": 0.23798882681564246,
"acc_stderr": 0.014242630070574915,
"acc_norm": 0.27262569832402234,
"acc_norm_stderr": 0.014893391735249588
},
"hendrycksTest-management": {
"acc": 0.23300970873786409,
"acc_stderr": 0.041858325989283136,
"acc_norm": 0.2815533980582524,
"acc_norm_stderr": 0.04453254836326466
},
"hendrycksTest-professional_psychology": {
"acc": 0.21568627450980393,
"acc_stderr": 0.016639319350313264,
"acc_norm": 0.22875816993464052,
"acc_norm_stderr": 0.016992723465466243
},
"arc_challenge": {
"acc": 0.21075085324232082,
"acc_stderr": 0.011918271754852171,
"acc_norm": 0.24488054607508533,
"acc_norm_stderr": 0.012566273985131354
},
"hendrycksTest-professional_law": {
"acc": 0.2333767926988266,
"acc_stderr": 0.010803108481179099,
"acc_norm": 0.24315514993481094,
"acc_norm_stderr": 0.010956556654417355
},
"crows_pairs_english_socioeconomic": {
"likelihood_difference": 6.031578947368421,
"likelihood_difference_stderr": 0.5493093736965833,
"pct_stereotype": 0.49473684210526314,
"pct_stereotype_stderr": 0.03636763337787883
},
"hendrycksTest-high_school_chemistry": {
"acc": 0.21674876847290642,
"acc_stderr": 0.028990331252516235,
"acc_norm": 0.21674876847290642,
"acc_norm_stderr": 0.028990331252516235
},
"hendrycksTest-formal_logic": {
"acc": 0.2619047619047619,
"acc_stderr": 0.03932537680392872,
"acc_norm": 0.2619047619047619,
"acc_norm_stderr": 0.039325376803928724
},
"hendrycksTest-machine_learning": {
"acc": 0.33035714285714285,
"acc_stderr": 0.04464285714285714,
"acc_norm": 0.32142857142857145,
"acc_norm_stderr": 0.044328040552915185
},
"hendrycksTest-college_medicine": {
"acc": 0.2138728323699422,
"acc_stderr": 0.03126511206173044,
"acc_norm": 0.26011560693641617,
"acc_norm_stderr": 0.033450369167889925
},
"crows_pairs_english_sexual_orientation": {
"likelihood_difference": 5.76377688172043,
"likelihood_difference_stderr": 0.7836824367448326,
"pct_stereotype": 0.45161290322580644,
"pct_stereotype_stderr": 0.051883930752016603
},
"hendrycksTest-computer_security": {
"acc": 0.25,
"acc_stderr": 0.04351941398892446,
"acc_norm": 0.32,
"acc_norm_stderr": 0.046882617226215034
},
"sciq": {
"acc": 0.201,
"acc_stderr": 0.012679107214617322,
"acc_norm": 0.212,
"acc_norm_stderr": 0.012931481864938052
},
"hendrycksTest-high_school_biology": {
"acc": 0.21935483870967742,
"acc_stderr": 0.02354079935872329,
"acc_norm": 0.22258064516129034,
"acc_norm_stderr": 0.02366421667164251
},
"crows_pairs_french_nationality": {
"likelihood_difference": 10.010128458498023,
"likelihood_difference_stderr": 0.5534217267350656,
"pct_stereotype": 0.5770750988142292,
"pct_stereotype_stderr": 0.031120568731718617
},
"hendrycksTest-high_school_computer_science": {
"acc": 0.25,
"acc_stderr": 0.04351941398892446,
"acc_norm": 0.36,
"acc_norm_stderr": 0.04824181513244218
},
"hendrycksTest-global_facts": {
"acc": 0.28,
"acc_stderr": 0.045126085985421276,
"acc_norm": 0.28,
"acc_norm_stderr": 0.04512608598542126
},
"hendrycksTest-medical_genetics": {
"acc": 0.23,
"acc_stderr": 0.04229525846816505,
"acc_norm": 0.22,
"acc_norm_stderr": 0.041633319989322695
},
"crows_pairs_english_gender": {
"likelihood_difference": 4.97685546875,
"likelihood_difference_stderr": 0.5099312963992724,
"pct_stereotype": 0.490625,
"pct_stereotype_stderr": 0.027989704184941015
},
"hendrycksTest-business_ethics": {
"acc": 0.23,
"acc_stderr": 0.04229525846816505,
"acc_norm": 0.25,
"acc_norm_stderr": 0.04351941398892446
},
"hendrycksTest-high_school_psychology": {
"acc": 0.25137614678899084,
"acc_stderr": 0.01859920636028741,
"acc_norm": 0.24770642201834864,
"acc_norm_stderr": 0.018508143602547805
},
"crows_pairs_french_religion": {
"likelihood_difference": 12.409239130434782,
"likelihood_difference_stderr": 0.9682588361060674,
"pct_stereotype": 0.6260869565217392,
"pct_stereotype_stderr": 0.045315858286449635
}
},
"versions": {
"arc_easy": 0,
"crows_pairs_french_disability": 0,
"hendrycksTest-high_school_microeconomics": 0,
"wsc": 0,
"hendrycksTest-high_school_physics": 0,
"hendrycksTest-jurisprudence": 0,
"hendrycksTest-world_religions": 0,
"crows_pairs_french": 0,
"crows_pairs_french_race_color": 0,
"hendrycksTest-high_school_mathematics": 0,
"crows_pairs_french_socioeconomic": 0,
"crows_pairs_english_religion": 0,
"hendrycksTest-college_computer_science": 0,
"hendrycksTest-professional_accounting": 0,
"hendrycksTest-electrical_engineering": 0,
"crows_pairs_english_physical_appearance": 0,
"crows_pairs_french_autre": 0,
"hendrycksTest-high_school_world_history": 0,
"hendrycksTest-college_chemistry": 0,
"lambada_openai": 0,
"crows_pairs_french_age": 0,
"crows_pairs_english_race_color": 0,
"hendrycksTest-high_school_european_history": 0,
"piqa": 0,
"hendrycksTest-anatomy": 0,
"crows_pairs_english": 0,
"hendrycksTest-clinical_knowledge": 0,
"hendrycksTest-high_school_geography": 0,
"crows_pairs_english_nationality": 0,
"hendrycksTest-security_studies": 0,
"hendrycksTest-professional_medicine": 0,
"hendrycksTest-human_sexuality": 0,
"hendrycksTest-conceptual_physics": 0,
"hendrycksTest-philosophy": 0,
"hendrycksTest-public_relations": 0,
"hendrycksTest-sociology": 0,
"crows_pairs_english_disability": 0,
"hendrycksTest-human_aging": 0,
"hendrycksTest-virology": 0,
"hendrycksTest-college_physics": 0,
"hendrycksTest-prehistory": 0,
"hendrycksTest-abstract_algebra": 0,
"hendrycksTest-astronomy": 0,
"winogrande": 0,
"hendrycksTest-high_school_statistics": 0,
"hendrycksTest-moral_disputes": 0,
"hendrycksTest-econometrics": 0,
"crows_pairs_french_gender": 0,
"hendrycksTest-high_school_macroeconomics": 0,
"hendrycksTest-high_school_us_history": 0,
"crows_pairs_english_autre": 0,
"hendrycksTest-nutrition": 0,
"hendrycksTest-logical_fallacies": 0,
"hendrycksTest-marketing": 0,
"logiqa": 0,
"hendrycksTest-elementary_mathematics": 0,
"hendrycksTest-international_law": 0,
"crows_pairs_french_sexual_orientation": 0,
"hendrycksTest-miscellaneous": 0,
"hendrycksTest-us_foreign_policy": 0,
"hendrycksTest-college_biology": 0,
"hendrycksTest-college_mathematics": 0,
"crows_pairs_english_age": 0,
"hendrycksTest-high_school_government_and_politics": 0,
"crows_pairs_french_physical_appearance": 0,
"hendrycksTest-moral_scenarios": 0,
"hendrycksTest-management": 0,
"hendrycksTest-professional_psychology": 0,
"arc_challenge": 0,
"hendrycksTest-professional_law": 0,
"crows_pairs_english_socioeconomic": 0,
"hendrycksTest-high_school_chemistry": 0,
"hendrycksTest-formal_logic": 0,
"hendrycksTest-machine_learning": 0,
"hendrycksTest-college_medicine": 0,
"crows_pairs_english_sexual_orientation": 0,
"hendrycksTest-computer_security": 0,
"sciq": 0,
"hendrycksTest-high_school_biology": 0,
"crows_pairs_french_nationality": 0,
"hendrycksTest-high_school_computer_science": 0,
"hendrycksTest-global_facts": 0,
"hendrycksTest-medical_genetics": 0,
"crows_pairs_english_gender": 0,
"hendrycksTest-business_ethics": 0,
"hendrycksTest-high_school_psychology": 0,
"crows_pairs_french_religion": 0
},
"config": {
"model": "hf-causal",
"model_args": "use_accelerate=True,pretrained=EleutherAI/pythia-v1.1-70m,revision=step8",
"num_fewshot": 0,
"batch_size": 32,
"device": null,
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}