karlexmarin's picture
feat: ship paper artefacts + CLI diagnostic alongside browser tool
535348a
raw
history blame
21.9 kB
{
"results": {
"crows_pairs_french_gender": {
"likelihood_difference": 5.687986760124611,
"likelihood_difference_stderr": 0.2761406584883121,
"pct_stereotype": 0.470404984423676,
"pct_stereotype_stderr": 0.02790184442005117
},
"hendrycksTest-marketing": {
"acc": 0.23076923076923078,
"acc_stderr": 0.027601921381417604,
"acc_norm": 0.23076923076923078,
"acc_norm_stderr": 0.027601921381417604
},
"hendrycksTest-high_school_psychology": {
"acc": 0.24587155963302754,
"acc_stderr": 0.01846194096870845,
"acc_norm": 0.26972477064220185,
"acc_norm_stderr": 0.019028486711115445
},
"hendrycksTest-college_chemistry": {
"acc": 0.23,
"acc_stderr": 0.04229525846816507,
"acc_norm": 0.23,
"acc_norm_stderr": 0.042295258468165065
},
"hendrycksTest-abstract_algebra": {
"acc": 0.21,
"acc_stderr": 0.040936018074033256,
"acc_norm": 0.14,
"acc_norm_stderr": 0.03487350880197772
},
"hendrycksTest-high_school_chemistry": {
"acc": 0.1921182266009852,
"acc_stderr": 0.02771931570961477,
"acc_norm": 0.22167487684729065,
"acc_norm_stderr": 0.029225575892489614
},
"hendrycksTest-econometrics": {
"acc": 0.2543859649122807,
"acc_stderr": 0.040969851398436695,
"acc_norm": 0.2719298245614035,
"acc_norm_stderr": 0.04185774424022056
},
"crows_pairs_english_disability": {
"likelihood_difference": 6.718269230769231,
"likelihood_difference_stderr": 0.8596632745046646,
"pct_stereotype": 0.4461538461538462,
"pct_stereotype_stderr": 0.06213651700539812
},
"hendrycksTest-high_school_computer_science": {
"acc": 0.19,
"acc_stderr": 0.03942772444036623,
"acc_norm": 0.29,
"acc_norm_stderr": 0.04560480215720684
},
"crows_pairs_french_nationality": {
"likelihood_difference": 6.856966403162056,
"likelihood_difference_stderr": 0.32916109908316876,
"pct_stereotype": 0.5454545454545454,
"pct_stereotype_stderr": 0.0313666163337434
},
"hendrycksTest-public_relations": {
"acc": 0.2818181818181818,
"acc_stderr": 0.04309118709946458,
"acc_norm": 0.18181818181818182,
"acc_norm_stderr": 0.036942843353377997
},
"crows_pairs_english_age": {
"likelihood_difference": 3.380837912087912,
"likelihood_difference_stderr": 0.49983969692767516,
"pct_stereotype": 0.5164835164835165,
"pct_stereotype_stderr": 0.05267597952306975
},
"logiqa": {
"acc": 0.21044546850998463,
"acc_stderr": 0.015988369488888755,
"acc_norm": 0.23348694316436253,
"acc_norm_stderr": 0.016593362460570887
},
"hendrycksTest-human_aging": {
"acc": 0.3273542600896861,
"acc_stderr": 0.03149384670994131,
"acc_norm": 0.29596412556053814,
"acc_norm_stderr": 0.03063659134869981
},
"hendrycksTest-formal_logic": {
"acc": 0.2698412698412698,
"acc_stderr": 0.03970158273235172,
"acc_norm": 0.30158730158730157,
"acc_norm_stderr": 0.04104947269903394
},
"hendrycksTest-clinical_knowledge": {
"acc": 0.18490566037735848,
"acc_stderr": 0.023893351834464324,
"acc_norm": 0.3169811320754717,
"acc_norm_stderr": 0.02863723563980091
},
"crows_pairs_french_autre": {
"likelihood_difference": 5.5,
"likelihood_difference_stderr": 0.9504975080559196,
"pct_stereotype": 0.5384615384615384,
"pct_stereotype_stderr": 0.14390989949130545
},
"hendrycksTest-moral_disputes": {
"acc": 0.2254335260115607,
"acc_stderr": 0.02249723019096755,
"acc_norm": 0.22832369942196531,
"acc_norm_stderr": 0.022598703804321624
},
"crows_pairs_english_sexual_orientation": {
"likelihood_difference": 5.677083333333333,
"likelihood_difference_stderr": 0.6194240763408452,
"pct_stereotype": 0.6451612903225806,
"pct_stereotype_stderr": 0.049883363937668256
},
"hendrycksTest-professional_psychology": {
"acc": 0.25326797385620914,
"acc_stderr": 0.01759348689536683,
"acc_norm": 0.272875816993464,
"acc_norm_stderr": 0.018020474148393577
},
"hendrycksTest-high_school_microeconomics": {
"acc": 0.18907563025210083,
"acc_stderr": 0.025435119438105357,
"acc_norm": 0.2773109243697479,
"acc_norm_stderr": 0.02907937453948001
},
"hendrycksTest-high_school_statistics": {
"acc": 0.2175925925925926,
"acc_stderr": 0.028139689444859645,
"acc_norm": 0.23148148148148148,
"acc_norm_stderr": 0.028765111718046944
},
"crows_pairs_english_gender": {
"likelihood_difference": 4.05078125,
"likelihood_difference_stderr": 0.38152405690444796,
"pct_stereotype": 0.5,
"pct_stereotype_stderr": 0.027994625547792713
},
"wsc": {
"acc": 0.6346153846153846,
"acc_stderr": 0.0474473339327792
},
"hendrycksTest-high_school_us_history": {
"acc": 0.18627450980392157,
"acc_stderr": 0.027325470966716336,
"acc_norm": 0.21568627450980393,
"acc_norm_stderr": 0.028867431449849313
},
"crows_pairs_english_religion": {
"likelihood_difference": 5.009853603603603,
"likelihood_difference_stderr": 0.5228133914951523,
"pct_stereotype": 0.5855855855855856,
"pct_stereotype_stderr": 0.04696953631102271
},
"sciq": {
"acc": 0.228,
"acc_stderr": 0.013273740700804483,
"acc_norm": 0.236,
"acc_norm_stderr": 0.013434451402438685
},
"crows_pairs_english_physical_appearance": {
"likelihood_difference": 4.72265625,
"likelihood_difference_stderr": 0.5793499299137083,
"pct_stereotype": 0.5555555555555556,
"pct_stereotype_stderr": 0.05897165471491952
},
"hendrycksTest-machine_learning": {
"acc": 0.2767857142857143,
"acc_stderr": 0.042466243366976256,
"acc_norm": 0.25,
"acc_norm_stderr": 0.04109974682633932
},
"hendrycksTest-prehistory": {
"acc": 0.28703703703703703,
"acc_stderr": 0.02517104191530968,
"acc_norm": 0.24382716049382716,
"acc_norm_stderr": 0.023891879541959593
},
"hendrycksTest-sociology": {
"acc": 0.23383084577114427,
"acc_stderr": 0.029929415408348384,
"acc_norm": 0.24875621890547264,
"acc_norm_stderr": 0.03056767593891672
},
"hendrycksTest-global_facts": {
"acc": 0.27,
"acc_stderr": 0.044619604333847394,
"acc_norm": 0.3,
"acc_norm_stderr": 0.046056618647183814
},
"crows_pairs_french": {
"likelihood_difference": 6.74689736135957,
"likelihood_difference_stderr": 0.15103608824599826,
"pct_stereotype": 0.5533691115086464,
"pct_stereotype_stderr": 0.012143526564900555
},
"hendrycksTest-medical_genetics": {
"acc": 0.29,
"acc_stderr": 0.045604802157206845,
"acc_norm": 0.28,
"acc_norm_stderr": 0.045126085985421276
},
"hendrycksTest-high_school_mathematics": {
"acc": 0.12222222222222222,
"acc_stderr": 0.019970605780284603,
"acc_norm": 0.1814814814814815,
"acc_norm_stderr": 0.023499264669407282
},
"hendrycksTest-college_biology": {
"acc": 0.2222222222222222,
"acc_stderr": 0.034765901043041336,
"acc_norm": 0.20833333333333334,
"acc_norm_stderr": 0.03396116205845335
},
"hendrycksTest-conceptual_physics": {
"acc": 0.251063829787234,
"acc_stderr": 0.02834696377716246,
"acc_norm": 0.2,
"acc_norm_stderr": 0.0261488180184245
},
"hendrycksTest-moral_scenarios": {
"acc": 0.23798882681564246,
"acc_stderr": 0.014242630070574915,
"acc_norm": 0.27262569832402234,
"acc_norm_stderr": 0.014893391735249588
},
"hendrycksTest-jurisprudence": {
"acc": 0.1574074074074074,
"acc_stderr": 0.03520703990517965,
"acc_norm": 0.21296296296296297,
"acc_norm_stderr": 0.039578354719809805
},
"crows_pairs_english": {
"likelihood_difference": 4.661393112701252,
"likelihood_difference_stderr": 0.13998586074905606,
"pct_stereotype": 0.456768038163387,
"pct_stereotype_stderr": 0.012167560197793078
},
"crows_pairs_french_sexual_orientation": {
"likelihood_difference": 13.163461538461538,
"likelihood_difference_stderr": 0.8325716351947234,
"pct_stereotype": 0.7912087912087912,
"pct_stereotype_stderr": 0.042843052065094304
},
"hendrycksTest-management": {
"acc": 0.23300970873786409,
"acc_stderr": 0.04185832598928315,
"acc_norm": 0.2815533980582524,
"acc_norm_stderr": 0.04453254836326467
},
"crows_pairs_english_socioeconomic": {
"likelihood_difference": 4.904440789473684,
"likelihood_difference_stderr": 0.4062917141669697,
"pct_stereotype": 0.48947368421052634,
"pct_stereotype_stderr": 0.036361587723547695
},
"hendrycksTest-logical_fallacies": {
"acc": 0.22085889570552147,
"acc_stderr": 0.032591773927421776,
"acc_norm": 0.3128834355828221,
"acc_norm_stderr": 0.036429145782924055
},
"hendrycksTest-astronomy": {
"acc": 0.20394736842105263,
"acc_stderr": 0.032790004063100495,
"acc_norm": 0.27631578947368424,
"acc_norm_stderr": 0.03639057569952925
},
"crows_pairs_english_autre": {
"likelihood_difference": 6.349431818181818,
"likelihood_difference_stderr": 2.804745680840638,
"pct_stereotype": 0.45454545454545453,
"pct_stereotype_stderr": 0.15745916432444335
},
"hendrycksTest-high_school_world_history": {
"acc": 0.17721518987341772,
"acc_stderr": 0.02485636418450322,
"acc_norm": 0.25738396624472576,
"acc_norm_stderr": 0.028458820991460295
},
"hendrycksTest-professional_medicine": {
"acc": 0.25,
"acc_stderr": 0.026303648393696036,
"acc_norm": 0.25,
"acc_norm_stderr": 0.026303648393696036
},
"hendrycksTest-college_computer_science": {
"acc": 0.25,
"acc_stderr": 0.04351941398892446,
"acc_norm": 0.21,
"acc_norm_stderr": 0.040936018074033256
},
"lambada_openai": {
"ppl": 705314.6370389248,
"ppl_stderr": 50610.68705557734,
"acc": 0.0,
"acc_stderr": 0.0
},
"hendrycksTest-college_medicine": {
"acc": 0.23699421965317918,
"acc_stderr": 0.03242414757483098,
"acc_norm": 0.2658959537572254,
"acc_norm_stderr": 0.03368762932259432
},
"arc_easy": {
"acc": 0.27441077441077444,
"acc_stderr": 0.00915617712224453,
"acc_norm": 0.2849326599326599,
"acc_norm_stderr": 0.009262170695590658
},
"hendrycksTest-security_studies": {
"acc": 0.3306122448979592,
"acc_stderr": 0.030116426296540613,
"acc_norm": 0.20408163265306123,
"acc_norm_stderr": 0.025801283475090506
},
"winogrande": {
"acc": 0.4925019731649566,
"acc_stderr": 0.014050905521228577
},
"crows_pairs_english_nationality": {
"likelihood_difference": 5.4428530092592595,
"likelihood_difference_stderr": 0.3840752204417463,
"pct_stereotype": 0.3333333333333333,
"pct_stereotype_stderr": 0.03214952147802749
},
"arc_challenge": {
"acc": 0.19965870307167236,
"acc_stderr": 0.011681625756888669,
"acc_norm": 0.24146757679180889,
"acc_norm_stderr": 0.01250656483973943
},
"hendrycksTest-computer_security": {
"acc": 0.22,
"acc_stderr": 0.04163331998932268,
"acc_norm": 0.27,
"acc_norm_stderr": 0.044619604333847394
},
"hendrycksTest-world_religions": {
"acc": 0.1695906432748538,
"acc_stderr": 0.028782108105401712,
"acc_norm": 0.25146198830409355,
"acc_norm_stderr": 0.033275044238468436
},
"crows_pairs_french_age": {
"likelihood_difference": 4.167361111111111,
"likelihood_difference_stderr": 0.49130810000225555,
"pct_stereotype": 0.4111111111111111,
"pct_stereotype_stderr": 0.052155640611075534
},
"hendrycksTest-elementary_mathematics": {
"acc": 0.2275132275132275,
"acc_stderr": 0.021591269407823778,
"acc_norm": 0.21164021164021163,
"acc_norm_stderr": 0.02103733150526289
},
"hendrycksTest-international_law": {
"acc": 0.10743801652892562,
"acc_stderr": 0.02826881219254063,
"acc_norm": 0.2396694214876033,
"acc_norm_stderr": 0.03896878985070417
},
"crows_pairs_french_disability": {
"likelihood_difference": 10.162878787878787,
"likelihood_difference_stderr": 1.04556369991972,
"pct_stereotype": 0.3333333333333333,
"pct_stereotype_stderr": 0.0584705346204686
},
"hendrycksTest-miscellaneous": {
"acc": 0.23627075351213284,
"acc_stderr": 0.015190473717037498,
"acc_norm": 0.25287356321839083,
"acc_norm_stderr": 0.015543377313719681
},
"hendrycksTest-high_school_european_history": {
"acc": 0.16363636363636364,
"acc_stderr": 0.028887872395487953,
"acc_norm": 0.24242424242424243,
"acc_norm_stderr": 0.03346409881055953
},
"crows_pairs_french_religion": {
"likelihood_difference": 7.765760869565217,
"likelihood_difference_stderr": 0.49195584086877725,
"pct_stereotype": 0.6869565217391305,
"pct_stereotype_stderr": 0.043432470166108225
},
"hendrycksTest-professional_accounting": {
"acc": 0.25886524822695034,
"acc_stderr": 0.026129572527180848,
"acc_norm": 0.2730496453900709,
"acc_norm_stderr": 0.02657786094330786
},
"hendrycksTest-high_school_geography": {
"acc": 0.18686868686868688,
"acc_stderr": 0.027772533334218977,
"acc_norm": 0.30303030303030304,
"acc_norm_stderr": 0.032742879140268674
},
"hendrycksTest-anatomy": {
"acc": 0.2074074074074074,
"acc_stderr": 0.03502553170678319,
"acc_norm": 0.25925925925925924,
"acc_norm_stderr": 0.03785714465066653
},
"hendrycksTest-philosophy": {
"acc": 0.2379421221864952,
"acc_stderr": 0.02418515064781871,
"acc_norm": 0.2990353697749196,
"acc_norm_stderr": 0.02600330111788513
},
"crows_pairs_english_race_color": {
"likelihood_difference": 4.281742125984252,
"likelihood_difference_stderr": 0.21780058915583433,
"pct_stereotype": 0.3838582677165354,
"pct_stereotype_stderr": 0.021598410071068296
},
"hendrycksTest-high_school_government_and_politics": {
"acc": 0.19689119170984457,
"acc_stderr": 0.028697873971860674,
"acc_norm": 0.2538860103626943,
"acc_norm_stderr": 0.03141024780565318
},
"hendrycksTest-high_school_physics": {
"acc": 0.2052980132450331,
"acc_stderr": 0.03297986648473836,
"acc_norm": 0.24503311258278146,
"acc_norm_stderr": 0.035118075718047245
},
"crows_pairs_french_socioeconomic": {
"likelihood_difference": 7.983976403061225,
"likelihood_difference_stderr": 0.545579868210259,
"pct_stereotype": 0.34183673469387754,
"pct_stereotype_stderr": 0.033967132039868675
},
"hendrycksTest-high_school_macroeconomics": {
"acc": 0.19743589743589743,
"acc_stderr": 0.02018264696867484,
"acc_norm": 0.22564102564102564,
"acc_norm_stderr": 0.02119363252514852
},
"hendrycksTest-human_sexuality": {
"acc": 0.29770992366412213,
"acc_stderr": 0.04010358942462203,
"acc_norm": 0.2824427480916031,
"acc_norm_stderr": 0.03948406125768361
},
"hendrycksTest-electrical_engineering": {
"acc": 0.25517241379310346,
"acc_stderr": 0.03632984052707842,
"acc_norm": 0.2689655172413793,
"acc_norm_stderr": 0.036951833116502325
},
"hendrycksTest-us_foreign_policy": {
"acc": 0.23,
"acc_stderr": 0.04229525846816505,
"acc_norm": 0.24,
"acc_norm_stderr": 0.042923469599092816
},
"crows_pairs_french_race_color": {
"likelihood_difference": 5.3552989130434785,
"likelihood_difference_stderr": 0.2271004698936648,
"pct_stereotype": 0.6869565217391305,
"pct_stereotype_stderr": 0.021645150653106047
},
"piqa": {
"acc": 0.5179542981501633,
"acc_stderr": 0.011658300623287153,
"acc_norm": 0.515778019586507,
"acc_norm_stderr": 0.011660014400426182
},
"hendrycksTest-virology": {
"acc": 0.22289156626506024,
"acc_stderr": 0.03240004825594688,
"acc_norm": 0.25301204819277107,
"acc_norm_stderr": 0.03384429155233137
},
"hendrycksTest-college_mathematics": {
"acc": 0.16,
"acc_stderr": 0.03684529491774708,
"acc_norm": 0.17,
"acc_norm_stderr": 0.0377525168068637
},
"hendrycksTest-high_school_biology": {
"acc": 0.2129032258064516,
"acc_stderr": 0.02328766512726853,
"acc_norm": 0.23870967741935484,
"acc_norm_stderr": 0.024251071262208837
},
"hendrycksTest-professional_law": {
"acc": 0.242503259452412,
"acc_stderr": 0.010946570966348783,
"acc_norm": 0.2711864406779661,
"acc_norm_stderr": 0.011354581451622986
},
"hendrycksTest-college_physics": {
"acc": 0.20588235294117646,
"acc_stderr": 0.04023382273617747,
"acc_norm": 0.19607843137254902,
"acc_norm_stderr": 0.03950581861179962
},
"hendrycksTest-nutrition": {
"acc": 0.19607843137254902,
"acc_stderr": 0.022733789405447593,
"acc_norm": 0.28431372549019607,
"acc_norm_stderr": 0.025829163272757482
},
"hendrycksTest-business_ethics": {
"acc": 0.32,
"acc_stderr": 0.04688261722621505,
"acc_norm": 0.29,
"acc_norm_stderr": 0.04560480215720684
},
"crows_pairs_french_physical_appearance": {
"likelihood_difference": 7.185329861111111,
"likelihood_difference_stderr": 0.9560662240150144,
"pct_stereotype": 0.5416666666666666,
"pct_stereotype_stderr": 0.05913268547421809
}
},
"versions": {
"crows_pairs_french_gender": 0,
"hendrycksTest-marketing": 0,
"hendrycksTest-high_school_psychology": 0,
"hendrycksTest-college_chemistry": 0,
"hendrycksTest-abstract_algebra": 0,
"hendrycksTest-high_school_chemistry": 0,
"hendrycksTest-econometrics": 0,
"crows_pairs_english_disability": 0,
"hendrycksTest-high_school_computer_science": 0,
"crows_pairs_french_nationality": 0,
"hendrycksTest-public_relations": 0,
"crows_pairs_english_age": 0,
"logiqa": 0,
"hendrycksTest-human_aging": 0,
"hendrycksTest-formal_logic": 0,
"hendrycksTest-clinical_knowledge": 0,
"crows_pairs_french_autre": 0,
"hendrycksTest-moral_disputes": 0,
"crows_pairs_english_sexual_orientation": 0,
"hendrycksTest-professional_psychology": 0,
"hendrycksTest-high_school_microeconomics": 0,
"hendrycksTest-high_school_statistics": 0,
"crows_pairs_english_gender": 0,
"wsc": 0,
"hendrycksTest-high_school_us_history": 0,
"crows_pairs_english_religion": 0,
"sciq": 0,
"crows_pairs_english_physical_appearance": 0,
"hendrycksTest-machine_learning": 0,
"hendrycksTest-prehistory": 0,
"hendrycksTest-sociology": 0,
"hendrycksTest-global_facts": 0,
"crows_pairs_french": 0,
"hendrycksTest-medical_genetics": 0,
"hendrycksTest-high_school_mathematics": 0,
"hendrycksTest-college_biology": 0,
"hendrycksTest-conceptual_physics": 0,
"hendrycksTest-moral_scenarios": 0,
"hendrycksTest-jurisprudence": 0,
"crows_pairs_english": 0,
"crows_pairs_french_sexual_orientation": 0,
"hendrycksTest-management": 0,
"crows_pairs_english_socioeconomic": 0,
"hendrycksTest-logical_fallacies": 0,
"hendrycksTest-astronomy": 0,
"crows_pairs_english_autre": 0,
"hendrycksTest-high_school_world_history": 0,
"hendrycksTest-professional_medicine": 0,
"hendrycksTest-college_computer_science": 0,
"lambada_openai": 0,
"hendrycksTest-college_medicine": 0,
"arc_easy": 0,
"hendrycksTest-security_studies": 0,
"winogrande": 0,
"crows_pairs_english_nationality": 0,
"arc_challenge": 0,
"hendrycksTest-computer_security": 0,
"hendrycksTest-world_religions": 0,
"crows_pairs_french_age": 0,
"hendrycksTest-elementary_mathematics": 0,
"hendrycksTest-international_law": 0,
"crows_pairs_french_disability": 0,
"hendrycksTest-miscellaneous": 0,
"hendrycksTest-high_school_european_history": 0,
"crows_pairs_french_religion": 0,
"hendrycksTest-professional_accounting": 0,
"hendrycksTest-high_school_geography": 0,
"hendrycksTest-anatomy": 0,
"hendrycksTest-philosophy": 0,
"crows_pairs_english_race_color": 0,
"hendrycksTest-high_school_government_and_politics": 0,
"hendrycksTest-high_school_physics": 0,
"crows_pairs_french_socioeconomic": 0,
"hendrycksTest-high_school_macroeconomics": 0,
"hendrycksTest-human_sexuality": 0,
"hendrycksTest-electrical_engineering": 0,
"hendrycksTest-us_foreign_policy": 0,
"crows_pairs_french_race_color": 0,
"piqa": 0,
"hendrycksTest-virology": 0,
"hendrycksTest-college_mathematics": 0,
"hendrycksTest-high_school_biology": 0,
"hendrycksTest-professional_law": 0,
"hendrycksTest-college_physics": 0,
"hendrycksTest-nutrition": 0,
"hendrycksTest-business_ethics": 0,
"crows_pairs_french_physical_appearance": 0
},
"config": {
"model": "hf-causal",
"model_args": "use_accelerate=True,pretrained=EleutherAI/pythia-v1.1-70m,revision=step256",
"num_fewshot": 0,
"batch_size": 32,
"device": null,
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}