File size: 21,920 Bytes
535348a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
{
  "results": {
    "crows_pairs_french_gender": {
      "likelihood_difference": 5.687986760124611,
      "likelihood_difference_stderr": 0.2761406584883121,
      "pct_stereotype": 0.470404984423676,
      "pct_stereotype_stderr": 0.02790184442005117
    },
    "hendrycksTest-marketing": {
      "acc": 0.23076923076923078,
      "acc_stderr": 0.027601921381417604,
      "acc_norm": 0.23076923076923078,
      "acc_norm_stderr": 0.027601921381417604
    },
    "hendrycksTest-high_school_psychology": {
      "acc": 0.24587155963302754,
      "acc_stderr": 0.01846194096870845,
      "acc_norm": 0.26972477064220185,
      "acc_norm_stderr": 0.019028486711115445
    },
    "hendrycksTest-college_chemistry": {
      "acc": 0.23,
      "acc_stderr": 0.04229525846816507,
      "acc_norm": 0.23,
      "acc_norm_stderr": 0.042295258468165065
    },
    "hendrycksTest-abstract_algebra": {
      "acc": 0.21,
      "acc_stderr": 0.040936018074033256,
      "acc_norm": 0.14,
      "acc_norm_stderr": 0.03487350880197772
    },
    "hendrycksTest-high_school_chemistry": {
      "acc": 0.1921182266009852,
      "acc_stderr": 0.02771931570961477,
      "acc_norm": 0.22167487684729065,
      "acc_norm_stderr": 0.029225575892489614
    },
    "hendrycksTest-econometrics": {
      "acc": 0.2543859649122807,
      "acc_stderr": 0.040969851398436695,
      "acc_norm": 0.2719298245614035,
      "acc_norm_stderr": 0.04185774424022056
    },
    "crows_pairs_english_disability": {
      "likelihood_difference": 6.718269230769231,
      "likelihood_difference_stderr": 0.8596632745046646,
      "pct_stereotype": 0.4461538461538462,
      "pct_stereotype_stderr": 0.06213651700539812
    },
    "hendrycksTest-high_school_computer_science": {
      "acc": 0.19,
      "acc_stderr": 0.03942772444036623,
      "acc_norm": 0.29,
      "acc_norm_stderr": 0.04560480215720684
    },
    "crows_pairs_french_nationality": {
      "likelihood_difference": 6.856966403162056,
      "likelihood_difference_stderr": 0.32916109908316876,
      "pct_stereotype": 0.5454545454545454,
      "pct_stereotype_stderr": 0.0313666163337434
    },
    "hendrycksTest-public_relations": {
      "acc": 0.2818181818181818,
      "acc_stderr": 0.04309118709946458,
      "acc_norm": 0.18181818181818182,
      "acc_norm_stderr": 0.036942843353377997
    },
    "crows_pairs_english_age": {
      "likelihood_difference": 3.380837912087912,
      "likelihood_difference_stderr": 0.49983969692767516,
      "pct_stereotype": 0.5164835164835165,
      "pct_stereotype_stderr": 0.05267597952306975
    },
    "logiqa": {
      "acc": 0.21044546850998463,
      "acc_stderr": 0.015988369488888755,
      "acc_norm": 0.23348694316436253,
      "acc_norm_stderr": 0.016593362460570887
    },
    "hendrycksTest-human_aging": {
      "acc": 0.3273542600896861,
      "acc_stderr": 0.03149384670994131,
      "acc_norm": 0.29596412556053814,
      "acc_norm_stderr": 0.03063659134869981
    },
    "hendrycksTest-formal_logic": {
      "acc": 0.2698412698412698,
      "acc_stderr": 0.03970158273235172,
      "acc_norm": 0.30158730158730157,
      "acc_norm_stderr": 0.04104947269903394
    },
    "hendrycksTest-clinical_knowledge": {
      "acc": 0.18490566037735848,
      "acc_stderr": 0.023893351834464324,
      "acc_norm": 0.3169811320754717,
      "acc_norm_stderr": 0.02863723563980091
    },
    "crows_pairs_french_autre": {
      "likelihood_difference": 5.5,
      "likelihood_difference_stderr": 0.9504975080559196,
      "pct_stereotype": 0.5384615384615384,
      "pct_stereotype_stderr": 0.14390989949130545
    },
    "hendrycksTest-moral_disputes": {
      "acc": 0.2254335260115607,
      "acc_stderr": 0.02249723019096755,
      "acc_norm": 0.22832369942196531,
      "acc_norm_stderr": 0.022598703804321624
    },
    "crows_pairs_english_sexual_orientation": {
      "likelihood_difference": 5.677083333333333,
      "likelihood_difference_stderr": 0.6194240763408452,
      "pct_stereotype": 0.6451612903225806,
      "pct_stereotype_stderr": 0.049883363937668256
    },
    "hendrycksTest-professional_psychology": {
      "acc": 0.25326797385620914,
      "acc_stderr": 0.01759348689536683,
      "acc_norm": 0.272875816993464,
      "acc_norm_stderr": 0.018020474148393577
    },
    "hendrycksTest-high_school_microeconomics": {
      "acc": 0.18907563025210083,
      "acc_stderr": 0.025435119438105357,
      "acc_norm": 0.2773109243697479,
      "acc_norm_stderr": 0.02907937453948001
    },
    "hendrycksTest-high_school_statistics": {
      "acc": 0.2175925925925926,
      "acc_stderr": 0.028139689444859645,
      "acc_norm": 0.23148148148148148,
      "acc_norm_stderr": 0.028765111718046944
    },
    "crows_pairs_english_gender": {
      "likelihood_difference": 4.05078125,
      "likelihood_difference_stderr": 0.38152405690444796,
      "pct_stereotype": 0.5,
      "pct_stereotype_stderr": 0.027994625547792713
    },
    "wsc": {
      "acc": 0.6346153846153846,
      "acc_stderr": 0.0474473339327792
    },
    "hendrycksTest-high_school_us_history": {
      "acc": 0.18627450980392157,
      "acc_stderr": 0.027325470966716336,
      "acc_norm": 0.21568627450980393,
      "acc_norm_stderr": 0.028867431449849313
    },
    "crows_pairs_english_religion": {
      "likelihood_difference": 5.009853603603603,
      "likelihood_difference_stderr": 0.5228133914951523,
      "pct_stereotype": 0.5855855855855856,
      "pct_stereotype_stderr": 0.04696953631102271
    },
    "sciq": {
      "acc": 0.228,
      "acc_stderr": 0.013273740700804483,
      "acc_norm": 0.236,
      "acc_norm_stderr": 0.013434451402438685
    },
    "crows_pairs_english_physical_appearance": {
      "likelihood_difference": 4.72265625,
      "likelihood_difference_stderr": 0.5793499299137083,
      "pct_stereotype": 0.5555555555555556,
      "pct_stereotype_stderr": 0.05897165471491952
    },
    "hendrycksTest-machine_learning": {
      "acc": 0.2767857142857143,
      "acc_stderr": 0.042466243366976256,
      "acc_norm": 0.25,
      "acc_norm_stderr": 0.04109974682633932
    },
    "hendrycksTest-prehistory": {
      "acc": 0.28703703703703703,
      "acc_stderr": 0.02517104191530968,
      "acc_norm": 0.24382716049382716,
      "acc_norm_stderr": 0.023891879541959593
    },
    "hendrycksTest-sociology": {
      "acc": 0.23383084577114427,
      "acc_stderr": 0.029929415408348384,
      "acc_norm": 0.24875621890547264,
      "acc_norm_stderr": 0.03056767593891672
    },
    "hendrycksTest-global_facts": {
      "acc": 0.27,
      "acc_stderr": 0.044619604333847394,
      "acc_norm": 0.3,
      "acc_norm_stderr": 0.046056618647183814
    },
    "crows_pairs_french": {
      "likelihood_difference": 6.74689736135957,
      "likelihood_difference_stderr": 0.15103608824599826,
      "pct_stereotype": 0.5533691115086464,
      "pct_stereotype_stderr": 0.012143526564900555
    },
    "hendrycksTest-medical_genetics": {
      "acc": 0.29,
      "acc_stderr": 0.045604802157206845,
      "acc_norm": 0.28,
      "acc_norm_stderr": 0.045126085985421276
    },
    "hendrycksTest-high_school_mathematics": {
      "acc": 0.12222222222222222,
      "acc_stderr": 0.019970605780284603,
      "acc_norm": 0.1814814814814815,
      "acc_norm_stderr": 0.023499264669407282
    },
    "hendrycksTest-college_biology": {
      "acc": 0.2222222222222222,
      "acc_stderr": 0.034765901043041336,
      "acc_norm": 0.20833333333333334,
      "acc_norm_stderr": 0.03396116205845335
    },
    "hendrycksTest-conceptual_physics": {
      "acc": 0.251063829787234,
      "acc_stderr": 0.02834696377716246,
      "acc_norm": 0.2,
      "acc_norm_stderr": 0.0261488180184245
    },
    "hendrycksTest-moral_scenarios": {
      "acc": 0.23798882681564246,
      "acc_stderr": 0.014242630070574915,
      "acc_norm": 0.27262569832402234,
      "acc_norm_stderr": 0.014893391735249588
    },
    "hendrycksTest-jurisprudence": {
      "acc": 0.1574074074074074,
      "acc_stderr": 0.03520703990517965,
      "acc_norm": 0.21296296296296297,
      "acc_norm_stderr": 0.039578354719809805
    },
    "crows_pairs_english": {
      "likelihood_difference": 4.661393112701252,
      "likelihood_difference_stderr": 0.13998586074905606,
      "pct_stereotype": 0.456768038163387,
      "pct_stereotype_stderr": 0.012167560197793078
    },
    "crows_pairs_french_sexual_orientation": {
      "likelihood_difference": 13.163461538461538,
      "likelihood_difference_stderr": 0.8325716351947234,
      "pct_stereotype": 0.7912087912087912,
      "pct_stereotype_stderr": 0.042843052065094304
    },
    "hendrycksTest-management": {
      "acc": 0.23300970873786409,
      "acc_stderr": 0.04185832598928315,
      "acc_norm": 0.2815533980582524,
      "acc_norm_stderr": 0.04453254836326467
    },
    "crows_pairs_english_socioeconomic": {
      "likelihood_difference": 4.904440789473684,
      "likelihood_difference_stderr": 0.4062917141669697,
      "pct_stereotype": 0.48947368421052634,
      "pct_stereotype_stderr": 0.036361587723547695
    },
    "hendrycksTest-logical_fallacies": {
      "acc": 0.22085889570552147,
      "acc_stderr": 0.032591773927421776,
      "acc_norm": 0.3128834355828221,
      "acc_norm_stderr": 0.036429145782924055
    },
    "hendrycksTest-astronomy": {
      "acc": 0.20394736842105263,
      "acc_stderr": 0.032790004063100495,
      "acc_norm": 0.27631578947368424,
      "acc_norm_stderr": 0.03639057569952925
    },
    "crows_pairs_english_autre": {
      "likelihood_difference": 6.349431818181818,
      "likelihood_difference_stderr": 2.804745680840638,
      "pct_stereotype": 0.45454545454545453,
      "pct_stereotype_stderr": 0.15745916432444335
    },
    "hendrycksTest-high_school_world_history": {
      "acc": 0.17721518987341772,
      "acc_stderr": 0.02485636418450322,
      "acc_norm": 0.25738396624472576,
      "acc_norm_stderr": 0.028458820991460295
    },
    "hendrycksTest-professional_medicine": {
      "acc": 0.25,
      "acc_stderr": 0.026303648393696036,
      "acc_norm": 0.25,
      "acc_norm_stderr": 0.026303648393696036
    },
    "hendrycksTest-college_computer_science": {
      "acc": 0.25,
      "acc_stderr": 0.04351941398892446,
      "acc_norm": 0.21,
      "acc_norm_stderr": 0.040936018074033256
    },
    "lambada_openai": {
      "ppl": 705314.6370389248,
      "ppl_stderr": 50610.68705557734,
      "acc": 0.0,
      "acc_stderr": 0.0
    },
    "hendrycksTest-college_medicine": {
      "acc": 0.23699421965317918,
      "acc_stderr": 0.03242414757483098,
      "acc_norm": 0.2658959537572254,
      "acc_norm_stderr": 0.03368762932259432
    },
    "arc_easy": {
      "acc": 0.27441077441077444,
      "acc_stderr": 0.00915617712224453,
      "acc_norm": 0.2849326599326599,
      "acc_norm_stderr": 0.009262170695590658
    },
    "hendrycksTest-security_studies": {
      "acc": 0.3306122448979592,
      "acc_stderr": 0.030116426296540613,
      "acc_norm": 0.20408163265306123,
      "acc_norm_stderr": 0.025801283475090506
    },
    "winogrande": {
      "acc": 0.4925019731649566,
      "acc_stderr": 0.014050905521228577
    },
    "crows_pairs_english_nationality": {
      "likelihood_difference": 5.4428530092592595,
      "likelihood_difference_stderr": 0.3840752204417463,
      "pct_stereotype": 0.3333333333333333,
      "pct_stereotype_stderr": 0.03214952147802749
    },
    "arc_challenge": {
      "acc": 0.19965870307167236,
      "acc_stderr": 0.011681625756888669,
      "acc_norm": 0.24146757679180889,
      "acc_norm_stderr": 0.01250656483973943
    },
    "hendrycksTest-computer_security": {
      "acc": 0.22,
      "acc_stderr": 0.04163331998932268,
      "acc_norm": 0.27,
      "acc_norm_stderr": 0.044619604333847394
    },
    "hendrycksTest-world_religions": {
      "acc": 0.1695906432748538,
      "acc_stderr": 0.028782108105401712,
      "acc_norm": 0.25146198830409355,
      "acc_norm_stderr": 0.033275044238468436
    },
    "crows_pairs_french_age": {
      "likelihood_difference": 4.167361111111111,
      "likelihood_difference_stderr": 0.49130810000225555,
      "pct_stereotype": 0.4111111111111111,
      "pct_stereotype_stderr": 0.052155640611075534
    },
    "hendrycksTest-elementary_mathematics": {
      "acc": 0.2275132275132275,
      "acc_stderr": 0.021591269407823778,
      "acc_norm": 0.21164021164021163,
      "acc_norm_stderr": 0.02103733150526289
    },
    "hendrycksTest-international_law": {
      "acc": 0.10743801652892562,
      "acc_stderr": 0.02826881219254063,
      "acc_norm": 0.2396694214876033,
      "acc_norm_stderr": 0.03896878985070417
    },
    "crows_pairs_french_disability": {
      "likelihood_difference": 10.162878787878787,
      "likelihood_difference_stderr": 1.04556369991972,
      "pct_stereotype": 0.3333333333333333,
      "pct_stereotype_stderr": 0.0584705346204686
    },
    "hendrycksTest-miscellaneous": {
      "acc": 0.23627075351213284,
      "acc_stderr": 0.015190473717037498,
      "acc_norm": 0.25287356321839083,
      "acc_norm_stderr": 0.015543377313719681
    },
    "hendrycksTest-high_school_european_history": {
      "acc": 0.16363636363636364,
      "acc_stderr": 0.028887872395487953,
      "acc_norm": 0.24242424242424243,
      "acc_norm_stderr": 0.03346409881055953
    },
    "crows_pairs_french_religion": {
      "likelihood_difference": 7.765760869565217,
      "likelihood_difference_stderr": 0.49195584086877725,
      "pct_stereotype": 0.6869565217391305,
      "pct_stereotype_stderr": 0.043432470166108225
    },
    "hendrycksTest-professional_accounting": {
      "acc": 0.25886524822695034,
      "acc_stderr": 0.026129572527180848,
      "acc_norm": 0.2730496453900709,
      "acc_norm_stderr": 0.02657786094330786
    },
    "hendrycksTest-high_school_geography": {
      "acc": 0.18686868686868688,
      "acc_stderr": 0.027772533334218977,
      "acc_norm": 0.30303030303030304,
      "acc_norm_stderr": 0.032742879140268674
    },
    "hendrycksTest-anatomy": {
      "acc": 0.2074074074074074,
      "acc_stderr": 0.03502553170678319,
      "acc_norm": 0.25925925925925924,
      "acc_norm_stderr": 0.03785714465066653
    },
    "hendrycksTest-philosophy": {
      "acc": 0.2379421221864952,
      "acc_stderr": 0.02418515064781871,
      "acc_norm": 0.2990353697749196,
      "acc_norm_stderr": 0.02600330111788513
    },
    "crows_pairs_english_race_color": {
      "likelihood_difference": 4.281742125984252,
      "likelihood_difference_stderr": 0.21780058915583433,
      "pct_stereotype": 0.3838582677165354,
      "pct_stereotype_stderr": 0.021598410071068296
    },
    "hendrycksTest-high_school_government_and_politics": {
      "acc": 0.19689119170984457,
      "acc_stderr": 0.028697873971860674,
      "acc_norm": 0.2538860103626943,
      "acc_norm_stderr": 0.03141024780565318
    },
    "hendrycksTest-high_school_physics": {
      "acc": 0.2052980132450331,
      "acc_stderr": 0.03297986648473836,
      "acc_norm": 0.24503311258278146,
      "acc_norm_stderr": 0.035118075718047245
    },
    "crows_pairs_french_socioeconomic": {
      "likelihood_difference": 7.983976403061225,
      "likelihood_difference_stderr": 0.545579868210259,
      "pct_stereotype": 0.34183673469387754,
      "pct_stereotype_stderr": 0.033967132039868675
    },
    "hendrycksTest-high_school_macroeconomics": {
      "acc": 0.19743589743589743,
      "acc_stderr": 0.02018264696867484,
      "acc_norm": 0.22564102564102564,
      "acc_norm_stderr": 0.02119363252514852
    },
    "hendrycksTest-human_sexuality": {
      "acc": 0.29770992366412213,
      "acc_stderr": 0.04010358942462203,
      "acc_norm": 0.2824427480916031,
      "acc_norm_stderr": 0.03948406125768361
    },
    "hendrycksTest-electrical_engineering": {
      "acc": 0.25517241379310346,
      "acc_stderr": 0.03632984052707842,
      "acc_norm": 0.2689655172413793,
      "acc_norm_stderr": 0.036951833116502325
    },
    "hendrycksTest-us_foreign_policy": {
      "acc": 0.23,
      "acc_stderr": 0.04229525846816505,
      "acc_norm": 0.24,
      "acc_norm_stderr": 0.042923469599092816
    },
    "crows_pairs_french_race_color": {
      "likelihood_difference": 5.3552989130434785,
      "likelihood_difference_stderr": 0.2271004698936648,
      "pct_stereotype": 0.6869565217391305,
      "pct_stereotype_stderr": 0.021645150653106047
    },
    "piqa": {
      "acc": 0.5179542981501633,
      "acc_stderr": 0.011658300623287153,
      "acc_norm": 0.515778019586507,
      "acc_norm_stderr": 0.011660014400426182
    },
    "hendrycksTest-virology": {
      "acc": 0.22289156626506024,
      "acc_stderr": 0.03240004825594688,
      "acc_norm": 0.25301204819277107,
      "acc_norm_stderr": 0.03384429155233137
    },
    "hendrycksTest-college_mathematics": {
      "acc": 0.16,
      "acc_stderr": 0.03684529491774708,
      "acc_norm": 0.17,
      "acc_norm_stderr": 0.0377525168068637
    },
    "hendrycksTest-high_school_biology": {
      "acc": 0.2129032258064516,
      "acc_stderr": 0.02328766512726853,
      "acc_norm": 0.23870967741935484,
      "acc_norm_stderr": 0.024251071262208837
    },
    "hendrycksTest-professional_law": {
      "acc": 0.242503259452412,
      "acc_stderr": 0.010946570966348783,
      "acc_norm": 0.2711864406779661,
      "acc_norm_stderr": 0.011354581451622986
    },
    "hendrycksTest-college_physics": {
      "acc": 0.20588235294117646,
      "acc_stderr": 0.04023382273617747,
      "acc_norm": 0.19607843137254902,
      "acc_norm_stderr": 0.03950581861179962
    },
    "hendrycksTest-nutrition": {
      "acc": 0.19607843137254902,
      "acc_stderr": 0.022733789405447593,
      "acc_norm": 0.28431372549019607,
      "acc_norm_stderr": 0.025829163272757482
    },
    "hendrycksTest-business_ethics": {
      "acc": 0.32,
      "acc_stderr": 0.04688261722621505,
      "acc_norm": 0.29,
      "acc_norm_stderr": 0.04560480215720684
    },
    "crows_pairs_french_physical_appearance": {
      "likelihood_difference": 7.185329861111111,
      "likelihood_difference_stderr": 0.9560662240150144,
      "pct_stereotype": 0.5416666666666666,
      "pct_stereotype_stderr": 0.05913268547421809
    }
  },
  "versions": {
    "crows_pairs_french_gender": 0,
    "hendrycksTest-marketing": 0,
    "hendrycksTest-high_school_psychology": 0,
    "hendrycksTest-college_chemistry": 0,
    "hendrycksTest-abstract_algebra": 0,
    "hendrycksTest-high_school_chemistry": 0,
    "hendrycksTest-econometrics": 0,
    "crows_pairs_english_disability": 0,
    "hendrycksTest-high_school_computer_science": 0,
    "crows_pairs_french_nationality": 0,
    "hendrycksTest-public_relations": 0,
    "crows_pairs_english_age": 0,
    "logiqa": 0,
    "hendrycksTest-human_aging": 0,
    "hendrycksTest-formal_logic": 0,
    "hendrycksTest-clinical_knowledge": 0,
    "crows_pairs_french_autre": 0,
    "hendrycksTest-moral_disputes": 0,
    "crows_pairs_english_sexual_orientation": 0,
    "hendrycksTest-professional_psychology": 0,
    "hendrycksTest-high_school_microeconomics": 0,
    "hendrycksTest-high_school_statistics": 0,
    "crows_pairs_english_gender": 0,
    "wsc": 0,
    "hendrycksTest-high_school_us_history": 0,
    "crows_pairs_english_religion": 0,
    "sciq": 0,
    "crows_pairs_english_physical_appearance": 0,
    "hendrycksTest-machine_learning": 0,
    "hendrycksTest-prehistory": 0,
    "hendrycksTest-sociology": 0,
    "hendrycksTest-global_facts": 0,
    "crows_pairs_french": 0,
    "hendrycksTest-medical_genetics": 0,
    "hendrycksTest-high_school_mathematics": 0,
    "hendrycksTest-college_biology": 0,
    "hendrycksTest-conceptual_physics": 0,
    "hendrycksTest-moral_scenarios": 0,
    "hendrycksTest-jurisprudence": 0,
    "crows_pairs_english": 0,
    "crows_pairs_french_sexual_orientation": 0,
    "hendrycksTest-management": 0,
    "crows_pairs_english_socioeconomic": 0,
    "hendrycksTest-logical_fallacies": 0,
    "hendrycksTest-astronomy": 0,
    "crows_pairs_english_autre": 0,
    "hendrycksTest-high_school_world_history": 0,
    "hendrycksTest-professional_medicine": 0,
    "hendrycksTest-college_computer_science": 0,
    "lambada_openai": 0,
    "hendrycksTest-college_medicine": 0,
    "arc_easy": 0,
    "hendrycksTest-security_studies": 0,
    "winogrande": 0,
    "crows_pairs_english_nationality": 0,
    "arc_challenge": 0,
    "hendrycksTest-computer_security": 0,
    "hendrycksTest-world_religions": 0,
    "crows_pairs_french_age": 0,
    "hendrycksTest-elementary_mathematics": 0,
    "hendrycksTest-international_law": 0,
    "crows_pairs_french_disability": 0,
    "hendrycksTest-miscellaneous": 0,
    "hendrycksTest-high_school_european_history": 0,
    "crows_pairs_french_religion": 0,
    "hendrycksTest-professional_accounting": 0,
    "hendrycksTest-high_school_geography": 0,
    "hendrycksTest-anatomy": 0,
    "hendrycksTest-philosophy": 0,
    "crows_pairs_english_race_color": 0,
    "hendrycksTest-high_school_government_and_politics": 0,
    "hendrycksTest-high_school_physics": 0,
    "crows_pairs_french_socioeconomic": 0,
    "hendrycksTest-high_school_macroeconomics": 0,
    "hendrycksTest-human_sexuality": 0,
    "hendrycksTest-electrical_engineering": 0,
    "hendrycksTest-us_foreign_policy": 0,
    "crows_pairs_french_race_color": 0,
    "piqa": 0,
    "hendrycksTest-virology": 0,
    "hendrycksTest-college_mathematics": 0,
    "hendrycksTest-high_school_biology": 0,
    "hendrycksTest-professional_law": 0,
    "hendrycksTest-college_physics": 0,
    "hendrycksTest-nutrition": 0,
    "hendrycksTest-business_ethics": 0,
    "crows_pairs_french_physical_appearance": 0
  },
  "config": {
    "model": "hf-causal",
    "model_args": "use_accelerate=True,pretrained=EleutherAI/pythia-v1.1-70m,revision=step256",
    "num_fewshot": 0,
    "batch_size": 32,
    "device": null,
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}