Prompt_Squirrel_RAG / data /analysis /model_ab_n30_compare_20260321.json
Food Desert
Update docs diagrams, evaluation labels, and analysis artifacts
6566a4f
Raw
History Blame
18.4 kB
{
"rows": [
{
"model": "meta-llama/llama-3.1-8b-instruct",
"price_prompt_per_token": 2e-08,
"price_completion_per_token": 5e-08,
"price_total_per_token": 6.999999999999999e-08,
"price_ratio_vs_baseline": 1.0,
"json_valid_rate_selection": 1.0,
"json_parse_fail_rate_selection": 0.0,
"selection_attempt_error_rate": 0.0,
"selection_call_exhaust_rate": 0.06451612903225806,
"macro_precision": 0.48757333333333336,
"macro_recall": 0.68523,
"macro_f1": 0.5531033333333333,
"micro_precision": 0.4859038142620232,
"micro_recall": 0.6845794392523364,
"micro_f1": 0.5683802133850631,
"hallucinated_avg_extra_tags": 10.333333333333334,
"hallucinated_rate_fp_over_selected": 0.5140961857379768,
"latency_avg_total_s": 16.39666666666667,
"latency_p95_total_s": 28.11,
"weird_spurious_character_samples": 1,
"weird_rare_extra_ratio": 0.22580645161290322,
"top_extra_tags": [
[
"clothed",
18
],
[
"clothing",
15
],
[
"anthro",
12
],
[
"looking_at_viewer",
11
],
[
"eyes",
8
],
[
"male",
5
],
[
"text",
5
],
[
"bottomwear",
3
],
[
"holding_object",
3
],
[
"nose",
3
],
[
"pose",
3
],
[
"topless",
3
]
],
"n_errors": 0
},
{
"model": "mistralai/mistral-nemo",
"price_prompt_per_token": 2e-08,
"price_completion_per_token": 4e-08,
"price_total_per_token": 6.000000000000001e-08,
"price_ratio_vs_baseline": 0.8571428571428573,
"json_valid_rate_selection": 1.0,
"json_parse_fail_rate_selection": 0.0,
"selection_attempt_error_rate": 0.0,
"selection_call_exhaust_rate": 0.22580645161290322,
"macro_precision": 0.5021633333333333,
"macro_recall": 0.4754366666666667,
"macro_f1": 0.4667133333333333,
"micro_precision": 0.5153846153846153,
"micro_recall": 0.4696261682242991,
"micro_f1": 0.49144254278728605,
"hallucinated_avg_extra_tags": 6.3,
"hallucinated_rate_fp_over_selected": 0.4846153846153846,
"latency_avg_total_s": 16.548666666666666,
"latency_p95_total_s": 26.270000000000003,
"weird_spurious_character_samples": 3,
"weird_rare_extra_ratio": 0.13227513227513227,
"top_extra_tags": [
[
"clothed",
21
],
[
"clothing",
17
],
[
"anthro",
14
],
[
"looking_at_viewer",
11
],
[
"female",
8
],
[
"male",
6
],
[
"pose",
5
],
[
"text",
5
],
[
"playful",
3
],
[
"ambiguous_gender",
2
],
[
"eyes",
2
],
[
"humanoid",
2
]
],
"n_errors": 0
},
{
"model": "qwen/qwen-2.5-7b-instruct",
"price_prompt_per_token": 4e-08,
"price_completion_per_token": 1e-07,
"price_total_per_token": 1.3999999999999998e-07,
"price_ratio_vs_baseline": 2.0,
"json_valid_rate_selection": 1.0,
"json_parse_fail_rate_selection": 0.0,
"selection_attempt_error_rate": 0.09090909090909091,
"selection_call_exhaust_rate": 0.03225806451612903,
"macro_precision": 0.5090266666666666,
"macro_recall": 0.5072733333333334,
"macro_f1": 0.4856666666666667,
"micro_precision": 0.5071090047393365,
"micro_recall": 0.5,
"micro_f1": 0.5035294117647059,
"hallucinated_avg_extra_tags": 6.933333333333334,
"hallucinated_rate_fp_over_selected": 0.4928909952606635,
"latency_avg_total_s": 18.443333333333335,
"latency_p95_total_s": 33.75,
"weird_spurious_character_samples": 1,
"weird_rare_extra_ratio": 0.22596153846153846,
"top_extra_tags": [
[
"looking_at_viewer",
16
],
[
"clothed",
12
],
[
"anthro",
9
],
[
"clothing",
9
],
[
"text",
6
],
[
"eyes",
3
],
[
"feral",
3
],
[
"playful",
3
],
[
"pose",
3
],
[
"black_clothing",
2
],
[
"dark_background",
2
],
[
"excited",
2
]
],
"n_errors": 0
},
{
"model": "qwen/qwen2.5-coder-7b-instruct",
"price_prompt_per_token": 3e-08,
"price_completion_per_token": 9e-08,
"price_total_per_token": 1.2e-07,
"price_ratio_vs_baseline": 1.7142857142857144,
"json_valid_rate_selection": 1.0,
"json_parse_fail_rate_selection": 0.0,
"selection_attempt_error_rate": 0.0,
"selection_call_exhaust_rate": 0.0,
"macro_precision": 0.3461066666666667,
"macro_recall": 0.5469366666666666,
"macro_f1": 0.40475666666666665,
"micro_precision": 0.3219178082191781,
"micro_recall": 0.5490654205607477,
"micro_f1": 0.4058721934369603,
"hallucinated_avg_extra_tags": 16.5,
"hallucinated_rate_fp_over_selected": 0.678082191780822,
"latency_avg_total_s": 14.717333333333332,
"latency_p95_total_s": 24.189999999999998,
"weird_spurious_character_samples": 10,
"weird_rare_extra_ratio": 0.2080808080808081,
"top_extra_tags": [
[
"male",
23
],
[
"clothed",
22
],
[
"clothing",
18
],
[
"anthro",
14
],
[
"eyes",
8
],
[
"looking_at_viewer",
8
],
[
"duo",
7
],
[
"pose",
6
],
[
"canis",
5
],
[
"holding_object",
5
],
[
"wolf",
5
],
[
"background_character",
4
]
],
"n_errors": 0
},
{
"model": "mistralai/mistral-small-3.1-24b-instruct",
"price_prompt_per_token": 3e-08,
"price_completion_per_token": 1.1e-07,
"price_total_per_token": 1.4e-07,
"price_ratio_vs_baseline": 2.0000000000000004,
"json_valid_rate_selection": 1.0,
"json_parse_fail_rate_selection": 0.0,
"selection_attempt_error_rate": 0.0,
"selection_call_exhaust_rate": 0.0,
"macro_precision": 0.5346866666666666,
"macro_recall": 0.80151,
"macro_f1": 0.62949,
"micro_precision": 0.5291411042944786,
"micro_recall": 0.8060747663551402,
"micro_f1": 0.638888888888889,
"hallucinated_avg_extra_tags": 10.233333333333333,
"hallucinated_rate_fp_over_selected": 0.4708588957055215,
"latency_avg_total_s": 21.778666666666666,
"latency_p95_total_s": 31.800000000000004,
"weird_spurious_character_samples": 0,
"weird_rare_extra_ratio": 0.2280130293159609,
"top_extra_tags": [
[
"anthro",
11
],
[
"clothed",
11
],
[
"eyes",
10
],
[
"pose",
8
],
[
"ambiguous_gender",
7
],
[
"clothing",
7
],
[
"text",
5
],
[
"holding_object",
4
],
[
"humanoid",
4
],
[
"playful",
4
],
[
"action_pose",
3
],
[
"feral",
3
]
],
"n_errors": 0
}
],
"important_tags": [
"anthro",
"clothed",
"clothing",
"canid",
"felid",
"bear",
"looking_at_viewer",
"solo"
],
"per_tag": {
"meta-llama/llama-3.1-8b-instruct": {
"anthro": {
"support_gt": 16,
"predicted": 28,
"tp": 16,
"fp": 12,
"fn": 0,
"precision": 0.5714285714285714,
"recall": 1.0,
"f1": 0.7272727272727273
},
"clothed": {
"support_gt": 8,
"predicted": 26,
"tp": 8,
"fp": 18,
"fn": 0,
"precision": 0.3076923076923077,
"recall": 1.0,
"f1": 0.47058823529411764
},
"clothing": {
"support_gt": 12,
"predicted": 27,
"tp": 12,
"fp": 15,
"fn": 0,
"precision": 0.4444444444444444,
"recall": 1.0,
"f1": 0.6153846153846153
},
"canid": {
"support_gt": 9,
"predicted": 9,
"tp": 9,
"fp": 0,
"fn": 0,
"precision": 1.0,
"recall": 1.0,
"f1": 1.0
},
"felid": {
"support_gt": 2,
"predicted": 2,
"tp": 2,
"fp": 0,
"fn": 0,
"precision": 1.0,
"recall": 1.0,
"f1": 1.0
},
"bear": {
"support_gt": 3,
"predicted": 1,
"tp": 1,
"fp": 0,
"fn": 2,
"precision": 1.0,
"recall": 0.3333333333333333,
"f1": 0.5
},
"looking_at_viewer": {
"support_gt": 2,
"predicted": 13,
"tp": 2,
"fp": 11,
"fn": 0,
"precision": 0.15384615384615385,
"recall": 1.0,
"f1": 0.2666666666666667
},
"solo": {
"support_gt": 23,
"predicted": 25,
"tp": 23,
"fp": 2,
"fn": 0,
"precision": 0.92,
"recall": 1.0,
"f1": 0.9583333333333334
}
},
"mistralai/mistral-nemo": {
"anthro": {
"support_gt": 16,
"predicted": 30,
"tp": 16,
"fp": 14,
"fn": 0,
"precision": 0.5333333333333333,
"recall": 1.0,
"f1": 0.6956521739130436
},
"clothed": {
"support_gt": 8,
"predicted": 29,
"tp": 8,
"fp": 21,
"fn": 0,
"precision": 0.27586206896551724,
"recall": 1.0,
"f1": 0.4324324324324324
},
"clothing": {
"support_gt": 12,
"predicted": 29,
"tp": 12,
"fp": 17,
"fn": 0,
"precision": 0.41379310344827586,
"recall": 1.0,
"f1": 0.5853658536585366
},
"canid": {
"support_gt": 9,
"predicted": 5,
"tp": 4,
"fp": 1,
"fn": 5,
"precision": 0.8,
"recall": 0.4444444444444444,
"f1": 0.5714285714285714
},
"felid": {
"support_gt": 2,
"predicted": 1,
"tp": 1,
"fp": 0,
"fn": 1,
"precision": 1.0,
"recall": 0.5,
"f1": 0.6666666666666666
},
"bear": {
"support_gt": 3,
"predicted": 2,
"tp": 2,
"fp": 0,
"fn": 1,
"precision": 1.0,
"recall": 0.6666666666666666,
"f1": 0.8
},
"looking_at_viewer": {
"support_gt": 2,
"predicted": 13,
"tp": 2,
"fp": 11,
"fn": 0,
"precision": 0.15384615384615385,
"recall": 1.0,
"f1": 0.2666666666666667
},
"solo": {
"support_gt": 23,
"predicted": 25,
"tp": 23,
"fp": 2,
"fn": 0,
"precision": 0.92,
"recall": 1.0,
"f1": 0.9583333333333334
}
},
"qwen/qwen-2.5-7b-instruct": {
"anthro": {
"support_gt": 16,
"predicted": 25,
"tp": 16,
"fp": 9,
"fn": 0,
"precision": 0.64,
"recall": 1.0,
"f1": 0.7804878048780487
},
"clothed": {
"support_gt": 8,
"predicted": 19,
"tp": 7,
"fp": 12,
"fn": 1,
"precision": 0.3684210526315789,
"recall": 0.875,
"f1": 0.5185185185185185
},
"clothing": {
"support_gt": 12,
"predicted": 21,
"tp": 12,
"fp": 9,
"fn": 0,
"precision": 0.5714285714285714,
"recall": 1.0,
"f1": 0.7272727272727273
},
"canid": {
"support_gt": 9,
"predicted": 6,
"tp": 5,
"fp": 1,
"fn": 4,
"precision": 0.8333333333333334,
"recall": 0.5555555555555556,
"f1": 0.6666666666666667
},
"felid": {
"support_gt": 2,
"predicted": 3,
"tp": 2,
"fp": 1,
"fn": 0,
"precision": 0.6666666666666666,
"recall": 1.0,
"f1": 0.8
},
"bear": {
"support_gt": 3,
"predicted": 1,
"tp": 1,
"fp": 0,
"fn": 2,
"precision": 1.0,
"recall": 0.3333333333333333,
"f1": 0.5
},
"looking_at_viewer": {
"support_gt": 2,
"predicted": 18,
"tp": 2,
"fp": 16,
"fn": 0,
"precision": 0.1111111111111111,
"recall": 1.0,
"f1": 0.19999999999999998
},
"solo": {
"support_gt": 23,
"predicted": 22,
"tp": 20,
"fp": 2,
"fn": 3,
"precision": 0.9090909090909091,
"recall": 0.8695652173913043,
"f1": 0.888888888888889
}
},
"qwen/qwen2.5-coder-7b-instruct": {
"anthro": {
"support_gt": 16,
"predicted": 30,
"tp": 16,
"fp": 14,
"fn": 0,
"precision": 0.5333333333333333,
"recall": 1.0,
"f1": 0.6956521739130436
},
"clothed": {
"support_gt": 8,
"predicted": 30,
"tp": 8,
"fp": 22,
"fn": 0,
"precision": 0.26666666666666666,
"recall": 1.0,
"f1": 0.4210526315789474
},
"clothing": {
"support_gt": 12,
"predicted": 30,
"tp": 12,
"fp": 18,
"fn": 0,
"precision": 0.4,
"recall": 1.0,
"f1": 0.5714285714285715
},
"canid": {
"support_gt": 9,
"predicted": 10,
"tp": 6,
"fp": 4,
"fn": 3,
"precision": 0.6,
"recall": 0.6666666666666666,
"f1": 0.631578947368421
},
"felid": {
"support_gt": 2,
"predicted": 1,
"tp": 1,
"fp": 0,
"fn": 1,
"precision": 1.0,
"recall": 0.5,
"f1": 0.6666666666666666
},
"bear": {
"support_gt": 3,
"predicted": 2,
"tp": 2,
"fp": 0,
"fn": 1,
"precision": 1.0,
"recall": 0.6666666666666666,
"f1": 0.8
},
"looking_at_viewer": {
"support_gt": 2,
"predicted": 9,
"tp": 1,
"fp": 8,
"fn": 1,
"precision": 0.1111111111111111,
"recall": 0.5,
"f1": 0.1818181818181818
},
"solo": {
"support_gt": 23,
"predicted": 26,
"tp": 23,
"fp": 3,
"fn": 0,
"precision": 0.8846153846153846,
"recall": 1.0,
"f1": 0.9387755102040816
}
},
"mistralai/mistral-small-3.1-24b-instruct": {
"anthro": {
"support_gt": 16,
"predicted": 27,
"tp": 16,
"fp": 11,
"fn": 0,
"precision": 0.5925925925925926,
"recall": 1.0,
"f1": 0.7441860465116279
},
"clothed": {
"support_gt": 8,
"predicted": 19,
"tp": 8,
"fp": 11,
"fn": 0,
"precision": 0.42105263157894735,
"recall": 1.0,
"f1": 0.5925925925925926
},
"clothing": {
"support_gt": 12,
"predicted": 19,
"tp": 12,
"fp": 7,
"fn": 0,
"precision": 0.631578947368421,
"recall": 1.0,
"f1": 0.7741935483870968
},
"canid": {
"support_gt": 9,
"predicted": 10,
"tp": 9,
"fp": 1,
"fn": 0,
"precision": 0.9,
"recall": 1.0,
"f1": 0.9473684210526316
},
"felid": {
"support_gt": 2,
"predicted": 1,
"tp": 1,
"fp": 0,
"fn": 1,
"precision": 1.0,
"recall": 0.5,
"f1": 0.6666666666666666
},
"bear": {
"support_gt": 3,
"predicted": 2,
"tp": 2,
"fp": 0,
"fn": 1,
"precision": 1.0,
"recall": 0.6666666666666666,
"f1": 0.8
},
"looking_at_viewer": {
"support_gt": 2,
"predicted": 3,
"tp": 2,
"fp": 1,
"fn": 0,
"precision": 0.6666666666666666,
"recall": 1.0,
"f1": 0.8
},
"solo": {
"support_gt": 23,
"predicted": 25,
"tp": 23,
"fp": 2,
"fn": 0,
"precision": 0.92,
"recall": 1.0,
"f1": 0.9583333333333334
}
}
}
}