// DDR-Bench Visualization Data // Auto-generated by export_web_data.py - matches Python plotting scripts exactly const DDR_DATA = { modelColors: { "GPT-5.2": "#00C853", "Claude-4.5-Sonnet": "#FF6D00", "Gemini-3-Flash": "#2196F3", "GLM-4.6": "#9C27B0", "DeepSeek-V3": "#E91E63" }, scaling: { "mimic": { "GPT-5.2": { "turns": [ 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55 ], "tokens": [ 3737, 8577, 15459, 20193, 24028, 26951, 28820, 29751, 30405, 30968, 31260 ], "costs": [ 0.005, 0.0207, 0.0516, 0.0947, 0.1522, 0.2153, 0.2799, 0.3597, 0.4373, 0.4906, 0.635 ], "accuracy": [ 10.85, 15.25, 18.35, 20.41, 23.26, 24.42, 25.32, 25.97, 26.36, 26.87, 27.26 ] }, "Gemini-3-Flash": { "turns": [ 5, 10, 15, 20, 25 ], "tokens": [ 5580, 14305, 23357, 26964, 27542 ], "costs": [ 0.002, 0.008, 0.0173, 0.0284, 0.045 ], "accuracy": [ 7.62, 13.44, 19.77, 24.03, 24.94 ] }, "Claude-4.5-Sonnet": { "turns": [ 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75 ], "tokens": [ 4513, 9214, 13378, 17217, 20275, 22878, 25379, 27559, 29532, 31239, 32395, 33382, 33796, 33968, 34140 ], "costs": [ 0.0152, 0.059, 0.1249, 0.2138, 0.3214, 0.4458, 0.5823, 0.7212, 0.842, 0.9656, 1.0851, 1.1605, 1.3008, 1.4081, 1.3369 ], "accuracy": [ 8.14, 9.17, 11.89, 14.73, 16.67, 18.22, 19.77, 22.87, 26.61, 29.46, 31.78, 33.59, 33.98, 34.24, 34.37 ] }, "GLM-4.6": { "turns": [ 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95 ], "tokens": [ 3488, 7059, 10542, 13099, 14972, 16484, 17524, 18410, 19112, 19728, 20259, 20715, 21135, 21489, 21858, 22169, 22422, 22613, 22802 ], "costs": [ 0.0026, 0.0097, 0.0217, 0.0369, 0.0552, 0.0743, 0.0969, 0.1204, 0.1489, 0.1769, 0.2074, 0.24, 0.2763, 0.3114, 0.3522, 0.3935, 0.4408, 0.4741, 0.5461 ], "accuracy": [ 9.43, 11.11, 13.57, 16.02, 17.57, 18.86, 19.77, 20.16, 20.8, 21.19, 21.45, 22.09, 22.48, 22.61, 22.87, 23.13, 23.13, 23.13, 23.26 ] }, "deepseek-v3.2": { "turns": [ 5, 10, 15, 20, 25, 30, 35, 40, 45 ], "tokens": [ 3447, 9155, 13730, 17912, 21270, 23962, 26205, 27253, 27411 ], "costs": [ 0.001, 0.0048, 0.011, 0.0192, 0.0282, 0.0385, 0.0499, 0.0642, 0.0694 ], "accuracy": [ 9.3, 12.53, 14.73, 17.05, 20.16, 23.9, 25.97, 26.87, 27.0 ] } }, "10k": { "Claude-4.5-Sonnet": { "turns": [ 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80 ], "tokens": [ 2561, 7208, 10978, 14050, 17080, 19955, 22501, 25124, 27696, 30109, 32363, 34247, 35154, 35907, 36148, 36277 ], "costs": [ 0.0094, 0.0414, 0.0955, 0.1682, 0.2576, 0.3598, 0.4751, 0.5932, 0.7209, 0.8684, 1.0029, 1.0913, 1.2015, 1.3713, 1.4854, 1.5611 ], "accuracy": [ 0.82, 1.06, 1.41, 3.18, 5.65, 9.78, 15.19, 22.85, 31.8, 46.64, 60.42, 69.02, 73.26, 75.62, 76.68, 77.27 ] }, "GPT-5.2": { "turns": [ 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60 ], "tokens": [ 2823, 7902, 11581, 14651, 16306, 17356, 17871, 18251, 18345, 18398, 18441, 18468 ], "costs": [ 0.0037, 0.0199, 0.0454, 0.0774, 0.1125, 0.1524, 0.1983, 0.2657, 0.3601, 0.4706, 0.5641, 0.6699 ], "accuracy": [ 0.82, 8.36, 22.85, 32.98, 37.57, 40.52, 43.23, 44.29, 44.41, 44.52, 44.76, 44.99 ] }, "GLM-4.6": { "turns": [ 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90 ], "tokens": [ 1838, 3927, 6056, 8124, 10227, 12399, 14608, 16757, 18614, 20290, 21537, 22418, 23164, 23553, 23781, 23950, 24062, 24102 ], "costs": [ 0.0013, 0.0053, 0.012, 0.0214, 0.0334, 0.0481, 0.0654, 0.0855, 0.1047, 0.1277, 0.1498, 0.1724, 0.2004, 0.223, 0.2716, 0.3281, 0.3281, 0.4018 ], "accuracy": [ 0.24, 0.59, 2.0, 4.48, 8.72, 13.19, 19.08, 26.27, 35.34, 41.22, 47.7, 52.3, 54.77, 56.54, 57.83, 59.25, 60.19, 60.42 ] }, "deepseek-v3.2": { "turns": [ 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60 ], "tokens": [ 1997, 4260, 6707, 9238, 11703, 14313, 16306, 18027, 19074, 19698, 19875, 19988 ], "costs": [ 0.0006, 0.0024, 0.0054, 0.0097, 0.0153, 0.0214, 0.028, 0.0355, 0.0437, 0.0505, 0.06, 0.0694 ], "accuracy": [ 1.18, 1.88, 2.83, 6.12, 14.13, 27.09, 37.34, 48.53, 55.48, 58.54, 59.72, 60.42 ] }, "Gemini-3-Flash": { "turns": [ 5, 10, 15, 20, 25, 30, 35, 40 ], "tokens": [ 4473, 12616, 18404, 20077, 20729, 20883, 20935, 21004 ], "costs": [ 0.002, 0.008, 0.0171, 0.0275, 0.0385, 0.0405, 0.0538, 0.0688 ], "accuracy": [ 1.88, 18.61, 38.4, 42.17, 43.7, 44.05, 44.05, 44.41 ] } }, "globem": { "deepseek-v3.2": { "turns": [ 5, 10, 15, 20, 25 ], "tokens": [ 3972, 10497, 20470, 32293, 36396 ], "costs": [ 0.001, 0.005, 0.0135, 0.0262, 0.0382 ], "accuracy": [ 0.92, 0.92, 4.6, 27.59, 36.78 ] }, "GLM-4.6": { "turns": [ 5, 10, 15, 20, 25, 30, 35 ], "tokens": [ 3851, 8817, 16221, 26186, 32076, 34058, 34425 ], "costs": [ 0.0027, 0.0115, 0.028, 0.0544, 0.0891, 0.1409, 0.1581 ], "accuracy": [ 2.3, 2.3, 5.52, 22.99, 36.32, 40.0, 41.61 ] }, "Gemini-3-Flash": { "turns": [ 5, 10, 15, 20, 25, 30, 35, 40 ], "tokens": [ 6260, 14825, 25972, 35526, 40312, 41787, 42167, 42236 ], "costs": [ 0.0021, 0.0074, 0.0166, 0.0292, 0.047, 0.0828, 0.1479, 0.2258 ], "accuracy": [ 1.88, 2.12, 5.88, 21.41, 30.35, 34.35, 35.06, 35.29 ] }, "Claude-4.5-Sonnet": { "turns": [ 5, 10, 15, 20, 25, 30 ], "tokens": [ 4579, 11405, 21188, 32526, 44888, 49137 ], "costs": [ 0.0152, 0.0638, 0.1611, 0.3124, 0.4877, 0.6335 ], "accuracy": [ 2.53, 2.53, 3.45, 12.64, 30.57, 40.23 ] }, "GPT-5.2": { "turns": [ 5, 10, 15, 20 ], "tokens": [ 3477, 10218, 17672, 19878 ], "costs": [ 0.0048, 0.0236, 0.0652, 0.1238 ], "accuracy": [ 0.92, 5.98, 34.02, 38.39 ] } } }, ranking: { "MIMIC": [ { "model": "gpt5-mini", "bt_rank": 1, "win_rate": 100.0, "accuracy": 27.59, "acc_rank": 7, "is_proprietary": true }, { "model": "claude4.5-sonnet", "bt_rank": 2, "win_rate": 94.6, "accuracy": 33.66, "acc_rank": 1, "is_proprietary": true }, { "model": "gpt5mini", "bt_rank": 3, "win_rate": 87.8, "accuracy": 27.59, "acc_rank": 8, "is_proprietary": true }, { "model": "gpt5.2", "bt_rank": 4, "win_rate": 83.6, "accuracy": 28.88, "acc_rank": 5, "is_proprietary": true }, { "model": "gpt5.1", "bt_rank": 5, "win_rate": 80.6, "accuracy": 30.1, "acc_rank": 3, "is_proprietary": true }, { "model": "gemini3-flash", "bt_rank": 6, "win_rate": 76.5, "accuracy": 29.28, "acc_rank": 4, "is_proprietary": true }, { "model": "kimi-k2", "bt_rank": 7, "win_rate": 73.1, "accuracy": 30.17, "acc_rank": 2, "is_proprietary": false }, { "model": "run_api_deepseek_deepseek-chat", "bt_rank": 8, "win_rate": 70.5, "accuracy": 27.65, "acc_rank": 6, "is_proprietary": false }, { "model": "gemini2.5-pro", "bt_rank": 9, "win_rate": 63.9, "accuracy": 19.0, "acc_rank": 14, "is_proprietary": true }, { "model": "qwen3-next-80b-a3b-instruct", "bt_rank": 10, "win_rate": 59.5, "accuracy": 18.8, "acc_rank": 15, "is_proprietary": false }, { "model": "minimax-m2", "bt_rank": 11, "win_rate": 59.7, "accuracy": 23.52, "acc_rank": 10, "is_proprietary": false }, { "model": "glm4.6", "bt_rank": 12, "win_rate": 52.1, "accuracy": 23.84, "acc_rank": 9, "is_proprietary": false }, { "model": "qwen3", "bt_rank": 13, "win_rate": 51.7, "accuracy": 19.13, "acc_rank": 13, "is_proprietary": false }, { "model": "qwen2.5-14B-Instruct-1M", "bt_rank": 14, "win_rate": 40.3, "accuracy": 20, "acc_rank": 11, "is_proprietary": false }, { "model": "gemini2.5-flash-lite", "bt_rank": 15, "win_rate": 35.4, "accuracy": 16.64, "acc_rank": 18, "is_proprietary": true }, { "model": "qwen2.5-14B-Instruct", "bt_rank": 16, "win_rate": 32.4, "accuracy": 14.15, "acc_rank": 20, "is_proprietary": false }, { "model": "qwen2.5-32b-instruct", "bt_rank": 17, "win_rate": 32.3, "accuracy": 13.12, "acc_rank": 21, "is_proprietary": false }, { "model": "gemini2.5-flash", "bt_rank": 18, "win_rate": 31.2, "accuracy": 18.61, "acc_rank": 16, "is_proprietary": true }, { "model": "qwen2.5-72B-Instruct", "bt_rank": 19, "win_rate": 29.5, "accuracy": 14.92, "acc_rank": 19, "is_proprietary": false }, { "model": "qwen3-4B-Instruct-2507", "bt_rank": 20, "win_rate": 27.3, "accuracy": 16.93, "acc_rank": 17, "is_proprietary": false }, { "model": "qwen2.5-7B-Instruct-1M", "bt_rank": 21, "win_rate": 17.3, "accuracy": 20, "acc_rank": 12, "is_proprietary": false }, { "model": "llama3.3-70B", "bt_rank": 22, "win_rate": 14.2, "accuracy": 7.3, "acc_rank": 22, "is_proprietary": false } ], "10K": [ { "model": "claude4.5-sonnet", "bt_rank": 1, "win_rate": 92.8, "accuracy": 69.26, "acc_rank": 1, "is_proprietary": true }, { "model": "run_api_deepseek_deepseek-chat", "bt_rank": 2, "win_rate": 80.6, "accuracy": 49.41, "acc_rank": 2, "is_proprietary": false }, { "model": "gpt5mini", "bt_rank": 3, "win_rate": 80.4, "accuracy": 41.56, "acc_rank": 5, "is_proprietary": true }, { "model": "gpt5.2", "bt_rank": 4, "win_rate": 78.0, "accuracy": 43.11, "acc_rank": 4, "is_proprietary": true }, { "model": "kimi-k2", "bt_rank": 5, "win_rate": 77.0, "accuracy": 41.17, "acc_rank": 7, "is_proprietary": false }, { "model": "glm4.6", "bt_rank": 6, "win_rate": 71.4, "accuracy": 48.29, "acc_rank": 3, "is_proprietary": false }, { "model": "gemini3-flash", "bt_rank": 7, "win_rate": 63.6, "accuracy": 39.5, "acc_rank": 8, "is_proprietary": true }, { "model": "qwen3-next-80b-a3b-instruct", "bt_rank": 8, "win_rate": 59.2, "accuracy": 38.34, "acc_rank": 9, "is_proprietary": false }, { "model": "minimax-m2", "bt_rank": 9, "win_rate": 54.4, "accuracy": 35.74, "acc_rank": 10, "is_proprietary": false }, { "model": "gpt5.1", "bt_rank": 10, "win_rate": 54.0, "accuracy": 41.23, "acc_rank": 6, "is_proprietary": true }, { "model": "qwen3", "bt_rank": 11, "win_rate": 51.0, "accuracy": 28.23, "acc_rank": 12, "is_proprietary": false }, { "model": "qwen2.5-14B-Instruct-1M", "bt_rank": 12, "win_rate": 45.6, "accuracy": 20, "acc_rank": 15, "is_proprietary": false }, { "model": "gemini2.5-pro", "bt_rank": 13, "win_rate": 44.8, "accuracy": 20.91, "acc_rank": 13, "is_proprietary": true }, { "model": "qwen2.5-32b-instruct", "bt_rank": 14, "win_rate": 41.2, "accuracy": 17.83, "acc_rank": 17, "is_proprietary": false }, { "model": "qwen2.5-72B-Instruct", "bt_rank": 15, "win_rate": 34.6, "accuracy": 20.79, "acc_rank": 14, "is_proprietary": false }, { "model": "qwen2.5-14B-Instruct", "bt_rank": 16, "win_rate": 31.6, "accuracy": 14.65, "acc_rank": 18, "is_proprietary": false }, { "model": "qwen3-4B-Instruct-2507", "bt_rank": 17, "win_rate": 30.0, "accuracy": 30.43, "acc_rank": 11, "is_proprietary": false }, { "model": "gemini2.5-flash-lite", "bt_rank": 18, "win_rate": 29.6, "accuracy": 14.37, "acc_rank": 19, "is_proprietary": true }, { "model": "qwen2.5-7B-Instruct-1M", "bt_rank": 19, "win_rate": 27.4, "accuracy": 20, "acc_rank": 16, "is_proprietary": false }, { "model": "gemini2.5-flash", "bt_rank": 20, "win_rate": 25.2, "accuracy": 12.61, "acc_rank": 20, "is_proprietary": true }, { "model": "qwen2.5-7B-Instruct", "bt_rank": 21, "win_rate": 22.0, "accuracy": 7.53, "acc_rank": 21, "is_proprietary": false }, { "model": "llama3.3-70B", "bt_rank": 22, "win_rate": 18.6, "accuracy": 6.51, "acc_rank": 22, "is_proprietary": false } ], "GLOBEM": [ { "model": "claude4.5-sonnet", "bt_rank": 1, "win_rate": 93.0, "accuracy": 39.54, "acc_rank": 2, "is_proprietary": true }, { "model": "gpt5-mini", "bt_rank": 2, "win_rate": 60.0, "accuracy": 33.91, "acc_rank": 12, "is_proprietary": true }, { "model": "gemini3-flash", "bt_rank": 3, "win_rate": 81.2, "accuracy": 35.46, "acc_rank": 9, "is_proprietary": true }, { "model": "minimax-m2", "bt_rank": 4, "win_rate": 77.8, "accuracy": 36.9, "acc_rank": 6, "is_proprietary": false }, { "model": "gpt5mini", "bt_rank": 5, "win_rate": 73.8, "accuracy": 33.91, "acc_rank": 13, "is_proprietary": true }, { "model": "gpt5.1", "bt_rank": 6, "win_rate": 67.5, "accuracy": 36.76, "acc_rank": 7, "is_proprietary": true }, { "model": "gpt5.2", "bt_rank": 7, "win_rate": 64.4, "accuracy": 38.39, "acc_rank": 3, "is_proprietary": true }, { "model": "qwen3", "bt_rank": 8, "win_rate": 64.7, "accuracy": 36.32, "acc_rank": 8, "is_proprietary": false }, { "model": "run_api_deepseek_deepseek-chat", "bt_rank": 9, "win_rate": 64.5, "accuracy": 38.39, "acc_rank": 4, "is_proprietary": false }, { "model": "glm4.6", "bt_rank": 10, "win_rate": 53.6, "accuracy": 39.77, "acc_rank": 1, "is_proprietary": false }, { "model": "kimi-k2", "bt_rank": 11, "win_rate": 52.2, "accuracy": 37.01, "acc_rank": 5, "is_proprietary": false }, { "model": "gemini2.5-pro", "bt_rank": 12, "win_rate": 45.6, "accuracy": 34.6, "acc_rank": 10, "is_proprietary": true }, { "model": "qwen2.5-72B-Instruct", "bt_rank": 13, "win_rate": 43.3, "accuracy": 27.13, "acc_rank": 14, "is_proprietary": false }, { "model": "qwen2.5-32B-Instruct", "bt_rank": 14, "win_rate": 42.1, "accuracy": 20, "acc_rank": 20, "is_proprietary": false }, { "model": "qwen3-next-80b-a3b-instruct", "bt_rank": 15, "win_rate": 41.5, "accuracy": 34.14, "acc_rank": 11, "is_proprietary": false }, { "model": "qwen2.5-14B-Instruct", "bt_rank": 16, "win_rate": 40.8, "accuracy": 26.13, "acc_rank": 16, "is_proprietary": false }, { "model": "gemini2.5-flash-lite", "bt_rank": 17, "win_rate": 37.4, "accuracy": 25.52, "acc_rank": 18, "is_proprietary": true }, { "model": "qwen3-4B-Instruct-2507", "bt_rank": 18, "win_rate": 36.6, "accuracy": 26.9, "acc_rank": 15, "is_proprietary": false }, { "model": "qwen2.5-14B-Instruct-1M", "bt_rank": 19, "win_rate": 32.0, "accuracy": 20, "acc_rank": 21, "is_proprietary": false }, { "model": "llama3.3-70B", "bt_rank": 20, "win_rate": 28.1, "accuracy": 22.65, "acc_rank": 19, "is_proprietary": false }, { "model": "qwen2.5-7B-Instruct", "bt_rank": 21, "win_rate": 22.2, "accuracy": 25.64, "acc_rank": 17, "is_proprietary": false }, { "model": "qwen2.5-7B-Instruct-1M", "bt_rank": 22, "win_rate": 19.7, "accuracy": 20, "acc_rank": 22, "is_proprietary": false } ] }, turn: { "mimic": [ { "model": "claude4.5-sonnet", "median": 52, "distribution": [ 0.0, 0.0, 1.0, 5.0, 31.0, 43.0, 13.0, 7.0, 0.0, 0.0 ] }, { "model": "qwen3", "median": 43, "distribution": [ 0.0, 1.0, 12.0, 29.0, 13.0, 9.0, 3.0, 2.0, 0.0, 31.0 ] }, { "model": "gpt5-mini", "median": 39, "distribution": [ 0.0, 0.0, 9.0, 42.0, 36.0, 12.0, 1.0, 0.0, 0.0, 0.0 ] }, { "model": "glm4.6", "median": 39, "distribution": [ 0.0, 6.3, 23.4, 20.7, 7.2, 13.5, 3.6, 6.3, 4.5, 14.4 ] }, { "model": "run_api_deepseek_deepseek-chat", "median": 33, "distribution": [ 0.0, 2.0, 22.0, 60.0, 16.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "gpt5.2", "median": 30, "distribution": [ 0.0, 10.0, 36.0, 32.0, 12.0, 10.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "gpt5.1", "median": 23, "distribution": [ 1.5, 39.7, 29.4, 19.9, 9.6, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "kimi-k2", "median": 19, "distribution": [ 0.0, 55.0, 44.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "minimax-m2", "median": 18, "distribution": [ 0.0, 70.0, 30.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen3-next-80b-a3b-instruct-note", "median": 17, "distribution": [ 12.0, 52.0, 24.0, 10.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "gemini2.5-pro", "median": 15, "distribution": [ 10.6, 70.2, 19.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "gemini3-flash", "median": 15, "distribution": [ 7.0, 71.0, 22.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen3-4B-Instruct-2507", "median": 14, "distribution": [ 0.0, 98.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen3-next-80b-a3b-instruct-noreasoning", "median": 14, "distribution": [ 7.0, 68.0, 22.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen3-next-80b-a3b-instruct", "median": 12, "distribution": [ 23.0, 62.0, 12.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0 ] }, { "model": "qwen2.5-72B-Instruct", "median": 11, "distribution": [ 15.0, 85.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen3-next-80b-a3b-instruct-longreasoning", "median": 11, "distribution": [ 24.0, 74.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen2.5-32b-instruct", "median": 11, "distribution": [ 33.0, 67.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "gemini2.5-flash-lite", "median": 11, "distribution": [ 29.0, 71.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "gemini2.5-flash", "median": 10, "distribution": [ 34.0, 65.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen2.5-14B-Instruct-1M", "median": 10, "distribution": [ 34.0, 65.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen3-next-80b-a3b-instruct-shortreasoning", "median": 9, "distribution": [ 64.0, 36.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen2.5-14B-Instruct", "median": 8, "distribution": [ 73.0, 27.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen2.5-7B-Instruct", "median": 7, "distribution": [ 90.0, 10.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "llama-3.3-70B", "median": 6, "distribution": [ 99.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "llama3.3-70B", "median": 6, "distribution": [ 99.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen2.5-7B-Instruct-1M", "median": 4, "distribution": [ 91.0, 9.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] } ], "10k": [ { "model": "claude4.5-sonnet", "median": 56, "distribution": [ 0.0, 0.0, 1.0, 6.0, 13.0, 44.0, 27.0, 6.0, 3.0, 0.0 ] }, { "model": "glm4.6", "median": 52, "distribution": [ 0.0, 0.0, 3.8, 10.4, 27.4, 27.4, 18.9, 5.7, 4.7, 1.9 ] }, { "model": "run_api_deepseek_deepseek-chat", "median": 39, "distribution": [ 0.0, 0.0, 11.0, 40.0, 37.0, 9.0, 3.0, 0.0, 0.0, 0.0 ] }, { "model": "gpt5mini", "median": 35, "distribution": [ 0.0, 4.0, 27.8, 36.5, 24.6, 6.3, 0.8, 0.0, 0.0, 0.0 ] }, { "model": "qwen3", "median": 26, "distribution": [ 0.8, 25.4, 30.2, 7.9, 2.4, 0.0, 0.0, 0.8, 0.0, 32.5 ] }, { "model": "kimi-k2", "median": 24, "distribution": [ 0.0, 29.0, 48.0, 21.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "minimax-m2", "median": 20, "distribution": [ 0.0, 43.0, 48.0, 9.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen3-next-80b-a3b-instruct", "median": 20, "distribution": [ 0.0, 46.0, 27.0, 11.0, 2.0, 6.0, 3.0, 1.0, 1.0, 3.0 ] }, { "model": "gpt5.2", "median": 20, "distribution": [ 0.0, 43.0, 41.0, 12.0, 3.0, 0.0, 1.0, 0.0, 0.0, 0.0 ] }, { "model": "gpt5.1", "median": 17, "distribution": [ 1.0, 69.0, 29.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen3-next-80b-a3b-instruct-note", "median": 16, "distribution": [ 17.0, 44.0, 27.0, 10.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "gemini2.5-pro", "median": 15, "distribution": [ 7.0, 73.0, 18.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "gemini2.5-flash-lite", "median": 14, "distribution": [ 14.0, 78.0, 8.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "gemini3-flash", "median": 13, "distribution": [ 10.0, 82.0, 7.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "gemini2.5-flash", "median": 12, "distribution": [ 21.0, 69.0, 8.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen3-4B-Instruct-2507", "median": 12, "distribution": [ 4.0, 91.0, 5.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen2.5-14B-Instruct-1M", "median": 11, "distribution": [ 31.0, 64.3, 4.0, 0.8, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen3-next-80b-a3b-instruct-longreasoning", "median": 11, "distribution": [ 28.0, 67.0, 5.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen2.5-32b-instruct", "median": 10, "distribution": [ 34.1, 65.9, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen3-next-80b-a3b-instruct-noreasoning", "median": 9, "distribution": [ 58.0, 41.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen2.5-14B-Instruct", "median": 9, "distribution": [ 58.0, 42.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen3-next-80b-a3b-instruct-shortreasoning", "median": 8, "distribution": [ 81.0, 19.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen2.5-7B-Instruct-1M", "median": 8, "distribution": [ 70.0, 29.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen2.5-72B-Instruct", "median": 7, "distribution": [ 75.0, 25.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen2.5-7B-Instruct", "median": 7, "distribution": [ 84.0, 16.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "llama3.3-70B", "median": 1, "distribution": [ 92.0, 7.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] } ], "globem": [ { "model": "claude4.5-sonnet", "median": 25, "distribution": [ 0.0, 6.0, 87.0, 7.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "gemini3-flash", "median": 21, "distribution": [ 2.0, 36.0, 58.0, 3.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "glm4.6", "median": 21, "distribution": [ 0.0, 23.0, 66.0, 11.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "run_api_deepseek_deepseek-chat", "median": 20, "distribution": [ 0.0, 32.0, 68.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen3-next-80b-a3b-instruct-note", "median": 19, "distribution": [ 16.0, 36.0, 33.0, 9.0, 5.0, 1.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen3", "median": 19, "distribution": [ 0.0, 50.0, 38.0, 9.0, 0.0, 0.0, 1.0, 0.0, 0.0, 2.0 ] }, { "model": "minimax-m2", "median": 17, "distribution": [ 0.0, 80.0, 20.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "gpt5-mini", "median": 17, "distribution": [ 2.0, 78.0, 20.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "kimi-k2", "median": 17, "distribution": [ 0.0, 82.0, 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen2.5-32B-Instruct", "median": 15, "distribution": [ 1.0, 84.0, 14.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "gpt5.2", "median": 15, "distribution": [ 0.0, 92.0, 8.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen2.5-72B-Instruct", "median": 14, "distribution": [ 4.0, 78.0, 17.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "gemini2.5-flash-lite", "median": 14, "distribution": [ 7.0, 80.0, 12.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen2.5-14B-Instruct-1M", "median": 14, "distribution": [ 13.0, 66.0, 16.0, 5.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen2.5-14B-Instruct", "median": 13, "distribution": [ 16.0, 82.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen3-next-80b-a3b-instruct", "median": 12, "distribution": [ 0.0, 99.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "gemini2.5-pro", "median": 12, "distribution": [ 3.0, 94.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen2.5-7B-Instruct-1M", "median": 12, "distribution": [ 18.0, 73.0, 7.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "gemini2.5-flash", "median": 12, "distribution": [ 15.0, 85.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen3-4B-Instruct-2507", "median": 12, "distribution": [ 12.0, 83.0, 5.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "gpt5.1", "median": 11, "distribution": [ 30.0, 70.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen3-next-80b-a3b-instruct-noreasoning", "median": 9, "distribution": [ 57.0, 42.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen3-next-80b-a3b-instruct-longreasoning", "median": 9, "distribution": [ 69.0, 30.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen3-next-80b-a3b-instruct-shortreasoning", "median": 9, "distribution": [ 66.0, 34.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "qwen2.5-7B-Instruct", "median": 9, "distribution": [ 53.0, 45.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0 ] }, { "model": "llama3.3-70B", "median": 6, "distribution": [ 98.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ] } ] }, probing: { "byTurn": { "mimic": { "Qwen2.5-32B": { "turns": [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ], "logprob": [ -4.3, -4.21, -4.04, -3.87, -3.59, -3.62, -3.33, -3.4, -2.93, -3.21 ], "sem": [ 0.25, 0.27, 0.32, 0.35, 0.35, 0.36, 0.34, 0.35, 0.32, 0.4 ] }, "Qwen2.5-72B": { "turns": [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ], "logprob": [ -3.51, -3.98, -3.68, -3.8, -3.26, -3.22, -3.12, -3.24, -3.08, -2.84 ], "sem": [ 0.15, 0.21, 0.21, 0.23, 0.23, 0.21, 0.25, 0.25, 0.28, 0.08 ] }, "Qwen3-4B": { "turns": [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ], "logprob": [ -3.48, -3.25, -3.3, -2.74, -2.75, -2.73, -2.72, -2.67, -2.62, -2.25 ], "sem": [ 0.04, 0.05, 0.04, 0.07, 0.06, 0.07, 0.07, 0.07, 0.06, 0.06 ] }, "Qwen3-30B-A3B": { "turns": [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ], "logprob": [ -4.94, -5.21, -5.51, -5.05, -4.96, -4.95, -4.75, -4.73, -4.6, -4.72 ], "sem": [ 0.15, 0.18, 0.2, 0.18, 0.19, 0.19, 0.17, 0.18, 0.16, 0.18 ] }, "Qwen3-Next-80B-A3B": { "turns": [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ], "logprob": [ -2.85, -2.86, -2.74, -2.65, -2.31, -2.14, -1.98, -2.03, -1.88, -1.82 ], "sem": [ 0.1, 0.1, 0.11, 0.11, 0.11, 0.13, 0.13, 0.18, 0.17, 0.09 ] } }, "globem": { "Qwen2.5-32B": { "turns": [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ], "logprob": [ -5.48, -5.83, -5.84, -5.91, -6.01, -6.03, -5.86, -5.73, -5.78, -5.73 ], "sem": [ 0.24, 0.28, 0.31, 0.33, 0.33, 0.35, 0.33, 0.35, 0.35, 0.36 ] }, "Qwen2.5-72B": { "turns": [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ], "logprob": [ -4.68, -5.56, -5.65, -5.59, -5.59, -5.49, -5.54, -5.4, -5.57, -5.53 ], "sem": [ 0.13, 0.18, 0.23, 0.23, 0.25, 0.25, 0.29, 0.32, 0.38, 0.46 ] }, "Qwen3-4B": { "turns": [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ], "logprob": [ -4.67, -4.16, -3.9, -3.76, -3.6, -3.47, -3.05, -2.99, -2.93, -2.78 ], "sem": [ 0.08, 0.07, 0.06, 0.06, 0.07, 0.08, 0.07, 0.08, 0.08, 0.09 ] }, "Qwen3-30B-A3B": { "turns": [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ], "logprob": [ -5.28, -5.23, -5.2, -5.19, -5.2, -5.01, -5.21, -4.95, -4.93, -4.81 ], "sem": [ 0.09, 0.09, 0.09, 0.08, 0.08, 0.08, 0.09, 0.09, 0.1, 0.1 ] }, "Qwen3-Next-80B-A3B": { "turns": [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ], "logprob": [ -3.1, -3.15, -3.06, -3.01, -2.95, -2.88, -2.78, -2.4, -2.46, -1.89 ], "sem": [ 0.06, 0.06, 0.06, 0.06, 0.06, 0.06, 0.07, 0.06, 0.14, 0.1 ] } }, "10k": { "Qwen2.5-32B": { "turns": [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ], "logprob": [ -6.59, -7.15, -6.99, -6.95, -6.82, -6.88, -6.71, -6.58, -6.67, -6.45 ], "sem": [ 0.26, 0.28, 0.29, 0.3, 0.29, 0.29, 0.29, 0.32, 0.36, 0.41 ] }, "Qwen2.5-72B": { "turns": [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ], "logprob": [ -5.51, -7.02, -6.45, -6.11, -5.98, -6.52, -7.02, -7.88, -8.05, -7.66 ], "sem": [ 0.26, 0.34, 0.34, 0.36, 0.4, 0.53, 0.62, 0.71, 0.81, 0.92 ] }, "Qwen3-4B": { "turns": [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ], "logprob": [ -4.68, -4.3, -3.57, -3.33, -3.27, -3.22, -3.06, -2.9, -2.75, -2.57 ], "sem": [ 0.18, 0.17, 0.15, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.14 ] }, "Qwen3-30B-A3B": { "turns": [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ], "logprob": [ -3.23, -3.31, -3.28, -3.16, -3.06, -2.97, -2.94, -2.87, -2.83, -2.73 ], "sem": [ 0.17, 0.17, 0.17, 0.17, 0.17, 0.16, 0.17, 0.18, 0.18, 0.17 ] }, "Qwen3-Next-80B-A3B": { "turns": [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ], "logprob": [ -3.25, -3.42, -3.21, -2.94, -2.81, -2.75, -2.7, -2.65, -2.55, -2.45 ], "sem": [ 0.16, 0.17, 0.17, 0.17, 0.16, 0.17, 0.16, 0.16, 0.16, 0.16 ] } } }, "byProgress": { "mimic": { "Qwen2.5-32B": { "progress": [ 0, 10, 20, 30, 40, 50, 60, 70, 80, 90 ], "logprob": [ -4.3, -4.12, -3.73, -3.62, -3.36, -3.05, -2.94, -3.12, -4.6, -4.42 ], "sem": [ 0.25, 0.21, 0.25, 0.36, 0.24, 0.25, 0.38, 0.45, 1.5, 0.1 ] }, "Qwen2.5-72B": { "progress": [ 0, 10, 20, 30, 40, 50, 60, 70, 80, 90 ], "logprob": [ -3.51, -3.98, -3.74, -3.26, -3.17, -3.24, -2.99, -2.53, -2.58, -2.42 ], "sem": [ 0.15, 0.21, 0.16, 0.23, 0.17, 0.25, 0.18, 0.09, 0.09, 0.2 ] }, "Qwen3-4B": { "progress": [ 0, 10, 20, 30, 40, 50, 60, 70, 80, 90 ], "logprob": [ -3.37, -2.93, -2.71, -2.33, -1.99, -2.04, -1.57, -1.46, -1.48, -1.44 ], "sem": [ 0.03, 0.04, 0.04, 0.04, 0.05, 0.08, 0.1, 0.05, 0.0, 0.01 ] }, "Qwen3-30B-A3B": { "progress": [ 0, 10, 20, 30, 40, 50, 60, 70, 80, 90 ], "logprob": [ -5.13, -4.72, -4.42, -4.17, -4.04, -3.9, -3.64, -3.45, -3.36, -3.17 ], "sem": [ 0.08, 0.07, 0.07, 0.07, 0.07, 0.08, 0.1, 0.14, 0.15, 0.26 ] }, "Qwen3-Next-80B-A3B": { "progress": [ 0, 10, 20, 30, 40, 50, 60, 70, 80, 90 ], "logprob": [ -2.85, -2.8, -2.65, -2.22, -1.98, -1.96, -1.79, -1.74, -1.83, -1.85 ], "sem": [ 0.1, 0.07, 0.11, 0.09, 0.13, 0.12, 0.08, 0.16, 0.15, 0.39 ] } }, "globem": { "Qwen2.5-32B": { "progress": [ 0, 10, 20, 30, 40, 50, 60, 70, 80, 90 ], "logprob": [ -5.66, -5.92, -5.88, -5.79, -5.79, -5.55, -5.47, -4.8, -3.55, -3.24 ], "sem": [ 0.18, 0.19, 0.2, 0.21, 0.29, 0.29, 0.47, 0.63, 0.19, 0.47 ] }, "Qwen2.5-72B": { "progress": [ 0, 10, 20, 30, 40, 50, 60, 70, 80, 90 ], "logprob": [ -4.68, -5.56, -5.62, -5.59, -5.51, -5.4, -5.56, -5.03, -5.77, -7.71 ], "sem": [ 0.13, 0.18, 0.16, 0.25, 0.19, 0.32, 0.29, 0.55, 0.83, 0.1 ] }, "Qwen3-4B": { "progress": [ 0, 10, 20, 30, 40, 50, 60, 70, 80, 90 ], "logprob": [ -4.42, -3.83, -3.38, -2.96, -2.71, -2.6, -2.46, -2.53, -2.63, -2.61 ], "sem": [ 0.06, 0.04, 0.04, 0.05, 0.07, 0.08, 0.12, 0.14, 0.25, 0.04 ] }, "Qwen3-30B-A3B": { "progress": [ 0, 10, 20, 30, 40, 50, 60, 70, 80, 90 ], "logprob": [ -5.26, -5.2, -5.06, -4.82, -4.5, -4.51, -4.37, -4.1, -4.03, -3.74 ], "sem": [ 0.06, 0.05, 0.05, 0.06, 0.07, 0.08, 0.1, 0.29, 0.25, 0.11 ] }, "Qwen3-Next-80B-A3B": { "progress": [ 10, 20, 30, 40, 50, 60, 70, 80, 90 ], "logprob": [ -3.1, -3.15, -3.06, -3.01, -2.95, -2.88, -2.78, -2.4, -2.46 ], "sem": [ 0.06, 0.06, 0.06, 0.06, 0.06, 0.06, 0.07, 0.06, 0.14 ] } }, "10k": { "Qwen2.5-32B": { "progress": [ 0, 10, 20, 30, 40, 50, 60, 70, 80, 90 ], "logprob": [ -6.59, -7.07, -6.89, -6.8, -6.58, -6.58, -6.76, -8.0, -8.59, -8.83 ], "sem": [ 0.26, 0.2, 0.21, 0.2, 0.32, 0.27, 0.39, 0.57, 0.84, 1.12 ] }, "Qwen2.5-72B": { "progress": [ 0, 10, 20, 30, 40, 50, 60, 70, 80, 90 ], "logprob": [ -5.51, -7.02, -6.28, -5.98, -6.52, -7.33, -8.05, -7.85, -8.41, -7.15 ], "sem": [ 0.26, 0.34, 0.25, 0.4, 0.53, 0.47, 0.81, 0.79, 1.45, 1.26 ] }, "Qwen3-4B": { "progress": [ 0, 10, 20, 30, 40, 50, 60, 70, 80, 90 ], "logprob": [ -4.49, -3.45, -3.19, -2.83, -2.5, -2.27, -2.31, -2.31, -2.35, -1.73 ], "sem": [ 0.12, 0.1, 0.08, 0.1, 0.1, 0.11, 0.2, 0.29, 0.36, 0.03 ] }, "Qwen3-30B-A3B": { "progress": [ 0, 10, 20, 30, 40, 50, 60, 70, 80, 90 ], "logprob": [ -3.14, -2.66, -2.29, -2.26, -1.97, -1.88, -1.52, -1.36, -1.61, -1.61 ], "sem": [ 0.06, 0.06, 0.07, 0.1, 0.14, 0.18, 0.08, 0.02, 0.05, 0.08 ] }, "Qwen3-Next-80B-A3B": { "progress": [ 0, 10, 20, 30, 40, 50, 60, 70, 80, 90 ], "logprob": [ -3.34, -2.99, -2.7, -2.5, -2.43, -2.55, -2.18, -2.28, -2.19, -2.5 ], "sem": [ 0.12, 0.1, 0.1, 0.11, 0.11, 0.15, 0.21, 0.22, 0.26, 0.38 ] } } } }, probingColors: { "Qwen2.5-32B": "#4A90D9", "Qwen2.5-72B": "#1A5FB4", "Qwen3-4B": "#57E389", "Qwen3-30B-A3B": "#26A269", "Qwen3-Next-80B-A3B": "#9141AC" }, error: [ { "main_category": "Fail in Exploration", "subcategory": "Insufficient Breadth", "count": 64, "percentage": 31.1, "color": "#1565C0" }, { "main_category": "Fail in Exploration", "subcategory": "Insufficient Depth", "count": 56, "percentage": 27.2, "color": "#42A5F5" }, { "main_category": "Poor Data-to-Insight", "subcategory": "Insight Misinterpretation", "count": 19, "percentage": 9.2, "color": "#2E7D32" }, { "main_category": "Poor Data-to-Insight", "subcategory": "Superficial Analysis", "count": 16, "percentage": 7.8, "color": "#43A047" }, { "main_category": "Poor Data-to-Insight", "subcategory": "Over Reasoning", "count": 15, "percentage": 7.3, "color": "#81C784" }, { "main_category": "Lost in Context", "subcategory": "Lost in Debugging", "count": 18, "percentage": 8.7, "color": "#C62828" }, { "main_category": "Lost in Context", "subcategory": "Fail in Summarization", "count": 10, "percentage": 4.9, "color": "#E53935" }, { "main_category": "Lost in Context", "subcategory": "Poor Instruction Following", "count": 8, "percentage": 3.9, "color": "#EF9A9A" } ] };