Food Desert committed on
Commit
06a3c46
·
1 Parent(s): 30bedf0

Switch Stage3 to explicit-only no-why selection, drop bear probe, and set k=1 defaults

Browse files
Files changed (49) hide show
  1. app.py +246 -28
  2. data/analysis/simplified_probe_tags.csv +1 -1
  3. data/eval_results/k1_default_recheck_seed42_n10.jsonl +11 -0
  4. data/eval_results/k_sweep_explicit_no_why_seed42_k1.jsonl +11 -0
  5. data/eval_results/k_sweep_explicit_no_why_seed42_k10.jsonl +0 -0
  6. data/eval_results/k_sweep_explicit_no_why_seed42_k2.jsonl +11 -0
  7. data/eval_results/k_sweep_explicit_no_why_seed42_k3.jsonl +11 -0
  8. data/eval_results/k_sweep_explicit_no_why_seed42_k4.jsonl +11 -0
  9. data/eval_results/k_sweep_explicit_no_why_seed42_k6.jsonl +0 -0
  10. data/eval_results/latency_baseline_seed42.jsonl +11 -0
  11. data/eval_results/latency_baseline_seed43.jsonl +11 -0
  12. data/eval_results/latency_chunk100_seed42.jsonl +11 -0
  13. data/eval_results/latency_chunk60_k6_seed42.jsonl +11 -0
  14. data/eval_results/latency_chunk60_k6_seed43.jsonl +11 -0
  15. data/eval_results/latency_k1_seed42.jsonl +11 -0
  16. data/eval_results/latency_k1_seed43.jsonl +11 -0
  17. data/eval_results/latency_k4_seed43.jsonl +11 -0
  18. data/eval_results/latency_single_shot_seed42.jsonl +11 -0
  19. data/eval_results/smoke_no_why_explicit_only_n1.jsonl +2 -0
  20. data/eval_results/smoke_no_why_explicit_only_n1_v2.jsonl +2 -0
  21. data/eval_results/why_gate_compare_explicit_n10.jsonl +11 -0
  22. data/eval_results/why_gate_compare_strong_implied_n10.jsonl +11 -0
  23. data/runtime_debug/eval_no_why_explicit_instruction_n10_20260303T005633Z.json +222 -0
  24. data/runtime_debug/eval_no_why_n10_20260302T210359Z.json +308 -0
  25. data/runtime_debug/false_positive_case_review_looking_anthro_bear_20260304.md +159 -0
  26. data/runtime_debug/llm_capture_20260302T162119Z/input_prompt.txt +1 -0
  27. data/runtime_debug/llm_capture_20260302T162202Z/input_prompt.txt +1 -0
  28. data/runtime_debug/llm_capture_20260302T162202Z/structural_request.json +13 -0
  29. data/runtime_debug/llm_capture_20260302T162202Z/structural_response_parsed.json +16 -0
  30. data/runtime_debug/llm_capture_20260302T162202Z/structural_response_raw.txt +1 -0
  31. data/runtime_debug/llm_capture_20260302T162249Z/input_prompt.txt +1 -0
  32. data/runtime_debug/llm_capture_20260302T162249Z/probe_request.json +14 -0
  33. data/runtime_debug/llm_capture_20260302T162249Z/probe_response_parsed.json +28 -0
  34. data/runtime_debug/llm_capture_20260302T162249Z/probe_response_raw.txt +10 -0
  35. data/runtime_debug/llm_capture_20260302T162249Z/selection_request.json +38 -0
  36. data/runtime_debug/llm_capture_20260302T162249Z/selection_response_parsed.json +3 -0
  37. data/runtime_debug/llm_capture_20260302T162249Z/selection_response_raw.txt +1 -0
  38. data/runtime_debug/llm_capture_20260302T162249Z/structural_request.json +13 -0
  39. data/runtime_debug/llm_capture_20260302T162249Z/structural_response_parsed.json +19 -0
  40. data/runtime_debug/llm_capture_20260302T162249Z/structural_response_raw.txt +1 -0
  41. data/runtime_debug/llm_capture_20260302T162249Z/summary.json +51 -0
  42. data/runtime_debug/selection_why_vs_no_why_20260302T191813Z.json +217 -0
  43. data/runtime_debug/whyless_replication_seeds_42_43_20260303T060318Z.json +123 -0
  44. data/runtime_metrics/ui_pipeline_timings.jsonl +3 -0
  45. data/structural_tag_definitions.csv +8 -8
  46. psq_rag/llm/select.py +1243 -1291
  47. psq_rag/retrieval/psq_retrieval.py +1 -1
  48. psq_rag/ui/group_ranked_display.py +198 -0
  49. scripts/eval_pipeline.py +2 -2
app.py CHANGED
@@ -1,16 +1,20 @@
1
  import gradio as gr
2
  import os
3
  import logging
 
 
 
4
  from PIL import Image
5
  from pathlib import Path
6
  from typing import List
7
- from concurrent.futures import ThreadPoolExecutor
8
 
9
  from psq_rag.pipeline.preproc import extract_user_provided_tags_upto_3_words
10
  from psq_rag.llm.rewrite import llm_rewrite_prompt
11
  from psq_rag.retrieval.psq_retrieval import psq_candidates_from_rewrite_phrases, _norm_tag_for_lookup
12
  from psq_rag.llm.select import llm_select_indices, llm_infer_structural_tags, llm_infer_probe_tags
13
  from psq_rag.retrieval.state import expand_tags_via_implications
 
14
 
15
 
16
  def _split_prompt_commas(s: str) -> List[str]:
@@ -80,6 +84,18 @@ os.environ["GRADIO_ANALYTICS_ENABLED"] = "0"
80
  MASCOT_DIR = Path(__file__).parent / "mascotimages"
81
  MASCOT_FILE = MASCOT_DIR / "transparentsquirrel.png"
82
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  try:
84
  from gradio_client import utils as _gc_utils
85
 
@@ -115,10 +131,39 @@ except Exception as e:
115
 
116
 
117
  allow_nsfw_tags = False
118
- verbose_retrieval = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  verbose_retrieval_all = False
120
  verbose_retrieval_limit = 20
121
  enable_probe_tags = os.environ.get("PSQ_ENABLE_PROBE", "1").strip() not in {"0", "false", "False"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
  css = """
124
  .scrollable-content{
@@ -147,21 +192,110 @@ css = """
147
  """
148
 
149
 
150
- def rag_pipeline_ui(user_prompt: str):
 
 
 
 
 
151
  logs = []
152
  def log(s): logs.append(s)
153
 
154
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  log("Start: received prompt")
156
  prompt_in = (user_prompt or "").strip()
157
  if not prompt_in:
158
- return "Error: empty prompt", ""
159
 
160
  log("Input:")
161
  log(prompt_in)
162
  log("")
 
 
 
 
 
 
 
 
 
 
163
 
 
164
  user_tags = extract_user_provided_tags_upto_3_words(prompt_in)
 
 
 
165
  log("Heuristically extracted user tags:")
166
  if user_tags:
167
  log(", ".join(user_tags))
@@ -176,9 +310,16 @@ def rag_pipeline_ui(user_prompt: str):
176
  fut_struct = ex.submit(llm_infer_structural_tags, prompt_in, log=log)
177
  fut_probe = ex.submit(llm_infer_probe_tags, prompt_in, log=log) if enable_probe_tags else None
178
 
179
- rewritten = fut_rewrite.result()
180
- structural_tags = fut_struct.result()
181
- probe_tags = fut_probe.result() if fut_probe else []
 
 
 
 
 
 
 
182
 
183
  log("Rewrite:")
184
  log(rewritten if rewritten else "(empty)")
@@ -192,19 +333,28 @@ def rag_pipeline_ui(user_prompt: str):
192
 
193
  log("Step 2: Prompt Squirrel retrieval (hidden)")
194
  try:
 
195
  retrieval_context_tags = list(dict.fromkeys((structural_tags or []) + (probe_tags or [])))
196
  rewrite_phrases = [p.strip() for p in (rewrite_for_retrieval or "").split(",") if p.strip()]
197
  retrieval_result = psq_candidates_from_rewrite_phrases(
198
  rewrite_phrases=rewrite_phrases,
199
  allow_nsfw_tags=allow_nsfw_tags,
200
  context_tags=retrieval_context_tags,
201
- global_k=300,
 
 
202
  verbose=verbose_retrieval,
203
  )
204
  if isinstance(retrieval_result, tuple):
205
  candidates, phrase_reports = retrieval_result
206
  else:
207
  candidates, phrase_reports = retrieval_result, []
 
 
 
 
 
 
208
  log(f"Retrieved {len(candidates)} candidate tags")
209
  if verbose_retrieval:
210
  log(f"Total unique candidates: {len(candidates)}")
@@ -255,12 +405,20 @@ def rag_pipeline_ui(user_prompt: str):
255
  structural_tags=structural_tags,
256
  probe_tags=probe_tags,
257
  )
258
- picked_indices = llm_select_indices(
259
- query_text=selection_query,
260
- candidates=candidates,
261
- max_pick=0,
262
- log=log,
263
- )
 
 
 
 
 
 
 
 
264
 
265
  selected_tags = [candidates[i].tag for i in picked_indices] if picked_indices else []
266
 
@@ -282,8 +440,12 @@ def rag_pipeline_ui(user_prompt: str):
282
  log(" No probe tags inferred")
283
 
284
  log("Step 3c: Expand via tag implications")
 
285
  tag_set = set(selected_tags)
286
  expanded, implied_only = expand_tags_via_implications(tag_set)
 
 
 
287
  if implied_only:
288
  selected_tags.extend(sorted(implied_only))
289
  log(f" Added {len(implied_only)} implied tags: {', '.join(sorted(implied_only))}")
@@ -291,14 +453,41 @@ def rag_pipeline_ui(user_prompt: str):
291
  log(" No additional implied tags")
292
 
293
  log("Step 4: Compose final prompt")
 
294
  final_prompt = compose_final_prompt(rewritten, selected_tags)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
 
 
 
 
296
  log("Done: final prompt ready")
297
- return "\n".join(logs), final_prompt
298
 
299
  except Exception as e:
300
  log(f"Error: {type(e).__name__}: {e}")
301
- return "\n".join(logs), ""
302
 
303
 
304
 
@@ -311,14 +500,17 @@ with gr.Blocks(css=css) as app:
311
  lines=1
312
  )
313
  with gr.Column(scale=1):
314
- _mascot_pil = Image.open(MASCOT_FILE).convert("RGBA")
315
- mascot_img = gr.Image(
316
- value=_mascot_pil,
317
- show_label=False,
318
- interactive=False,
319
- height=220,
320
- elem_id="mascot"
321
- )
 
 
 
322
  submit_button = gr.Button("Run", variant="primary")
323
 
324
  gr.Markdown(
@@ -344,16 +536,42 @@ then returns a cleaned, model-friendly prompt.
344
  placeholder="Your optimized prompt will appear here."
345
  )
346
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
347
  submit_button.click(
348
  rag_pipeline_ui,
349
- inputs=[image_tags],
350
- outputs=[console, final_prompt]
351
  )
352
 
353
  image_tags.submit(
354
  rag_pipeline_ui,
355
- inputs=[image_tags],
356
- outputs=[console, final_prompt]
357
  )
358
 
359
  if __name__ == "__main__":
 
1
  import gradio as gr
2
  import os
3
  import logging
4
+ import time
5
+ import json
6
+ from datetime import datetime
7
  from PIL import Image
8
  from pathlib import Path
9
  from typing import List
10
+ from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeoutError
11
 
12
  from psq_rag.pipeline.preproc import extract_user_provided_tags_upto_3_words
13
  from psq_rag.llm.rewrite import llm_rewrite_prompt
14
  from psq_rag.retrieval.psq_retrieval import psq_candidates_from_rewrite_phrases, _norm_tag_for_lookup
15
  from psq_rag.llm.select import llm_select_indices, llm_infer_structural_tags, llm_infer_probe_tags
16
  from psq_rag.retrieval.state import expand_tags_via_implications
17
+ from psq_rag.ui.group_ranked_display import render_group_rankings_markdown
18
 
19
 
20
  def _split_prompt_commas(s: str) -> List[str]:
 
84
  MASCOT_DIR = Path(__file__).parent / "mascotimages"
85
  MASCOT_FILE = MASCOT_DIR / "transparentsquirrel.png"
86
 
87
+
88
def _load_mascot_image():
    """Return the mascot as an RGBA PIL image, or None if it cannot be loaded.

    A missing file and any error raised while opening or converting the image
    are both reported via ``logging.warning`` instead of being raised.
    """
    if MASCOT_FILE.exists():
        try:
            mascot = Image.open(MASCOT_FILE)
            return mascot.convert("RGBA")
        except Exception as e:
            logging.warning("Failed to load mascot image (%s): %s", MASCOT_FILE, e)
            return None
    logging.warning("Mascot image missing: %s", MASCOT_FILE)
    return None
98
+
99
  try:
100
  from gradio_client import utils as _gc_utils
101
 
 
131
 
132
 
133
  allow_nsfw_tags = False
134
def _is_production_runtime() -> bool:
    """Best-effort detection for deployed runtime (HF Spaces or explicit env).

    Returns True when any of the following holds, False otherwise:
      * PSQ_PRODUCTION is "1", "true" or "yes" (case-insensitive, surrounding
        whitespace ignored);
      * SPACE_ID or HF_SPACE_ID is set to a non-empty value;
      * SYSTEM is exactly "spaces".
    """
    env = os.environ
    explicit_flag = env.get("PSQ_PRODUCTION", "").strip().lower()
    return bool(
        explicit_flag in {"1", "true", "yes"}
        or env.get("SPACE_ID")
        or env.get("HF_SPACE_ID")
        or env.get("SYSTEM") == "spaces"
    )
145
+
146
+
147
def _env_int(name: str, default: int) -> int:
    """Read an integer setting from the environment.

    Returns *default* when the variable is unset or blank. A value that is not
    a valid integer is logged as a warning and also replaced by *default*, so a
    bad setting cannot crash the app at import time.
    """
    raw = os.environ.get(name)
    if raw is None or not raw.strip():
        return default
    try:
        return int(raw.strip())
    except ValueError:
        logging.warning("Ignoring invalid integer for %s=%r; using %d", name, raw, default)
        return default


def _env_float(name: str, default: float) -> float:
    """Read a float setting from the environment.

    Same fallback rules as ``_env_int``: unset, blank, or unparsable values
    yield *default* (the unparsable case is logged as a warning).
    """
    raw = os.environ.get(name)
    if raw is None or not raw.strip():
        return default
    try:
        return float(raw.strip())
    except ValueError:
        logging.warning("Ignoring invalid number for %s=%r; using %s", name, raw, default)
        return default


# Verbose retrieval logging: off by default in a production runtime, on
# otherwise; PSQ_VERBOSE_RETRIEVAL overrides the default either way.
verbose_retrieval_default = "0" if _is_production_runtime() else "1"
verbose_retrieval = os.environ.get("PSQ_VERBOSE_RETRIEVAL", verbose_retrieval_default).strip().lower() in {"1", "true", "yes"}
verbose_retrieval_all = False
verbose_retrieval_limit = 20
enable_probe_tags = os.environ.get("PSQ_ENABLE_PROBE", "1").strip() not in {"0", "false", "False"}

# Initial values for the "Display Settings" number inputs in the UI.
display_top_groups_default = _env_int("PSQ_DISPLAY_TOP_GROUPS", 10)
display_top_tags_per_group_default = _env_int("PSQ_DISPLAY_TOP_TAGS_PER_GROUP", 5)
display_rank_top_k_default = _env_int("PSQ_DISPLAY_GROUP_RANK_TOP_K", 5)

# Retrieval limits; the pipeline clamps each to at least 1 before use.
retrieval_global_k = _env_int("PSQ_RETRIEVAL_GLOBAL_K", 300)
retrieval_per_phrase_k = _env_int("PSQ_RETRIEVAL_PER_PHRASE_K", 10)
retrieval_per_phrase_final_k = _env_int("PSQ_RETRIEVAL_PER_PHRASE_FINAL_K", 1)

# Stage-3 selection settings. A candidate cap of 0 (or less) means "no cap".
selection_mode = os.environ.get("PSQ_SELECTION_MODE", "chunked_map_union").strip()
selection_chunk_size = _env_int("PSQ_SELECTION_CHUNK_SIZE", 60)
selection_per_phrase_k = _env_int("PSQ_SELECTION_PER_PHRASE_K", 2)
selection_candidate_cap = _env_int("PSQ_SELECTION_CANDIDATE_CAP", 0)

# Per-stage timeouts in seconds for the LLM calls run through futures.
stage1_rewrite_timeout_s = _env_float("PSQ_TIMEOUT_REWRITE_S", 45.0)
stage1_struct_timeout_s = _env_float("PSQ_TIMEOUT_STRUCT_S", 45.0)
stage1_probe_timeout_s = _env_float("PSQ_TIMEOUT_PROBE_S", 45.0)
stage3_select_timeout_s = _env_float("PSQ_TIMEOUT_SELECT_S", 45.0)

# JSONL file that receives one timing record per pipeline run.
timing_log_path = Path(os.environ.get("PSQ_TIMING_LOG_PATH", "data/runtime_metrics/ui_pipeline_timings.jsonl"))
167
 
168
  css = """
169
  .scrollable-content{
 
192
  """
193
 
194
 
195
+ def rag_pipeline_ui(
196
+ user_prompt: str,
197
+ display_top_groups: float,
198
+ display_top_tags_per_group: float,
199
+ display_rank_top_k: float,
200
+ ):
201
  logs = []
202
  def log(s): logs.append(s)
203
 
204
  try:
205
+ stage_timings = {}
206
+
207
+ def _record_timing(stage: str, dt_s: float):
208
+ stage_timings[stage] = float(dt_s)
209
+
210
+ def _emit_timing_summary(total_s: float):
211
+ summary_order = [
212
+ "preprocess",
213
+ "rewrite",
214
+ "structural",
215
+ "probe",
216
+ "retrieval",
217
+ "selection",
218
+ "implication_expansion",
219
+ "prompt_composition",
220
+ "group_display",
221
+ ]
222
+ lines = []
223
+ for k in summary_order:
224
+ if k in stage_timings:
225
+ lines.append(f"{k}={stage_timings[k]:.2f}s")
226
+ slowest = max(stage_timings.items(), key=lambda kv: kv[1])[0] if stage_timings else "n/a"
227
+ log("Timing Summary: " + ", ".join(lines))
228
+ log(f"Timing Slowest Stage: {slowest}")
229
+ log(f"Timing Total: {total_s:.2f}s")
230
+
231
+ def _append_timing_jsonl(total_s: float):
232
+ try:
233
+ timing_log_path.parent.mkdir(parents=True, exist_ok=True)
234
+ rec = {
235
+ "timestamp_utc": datetime.utcnow().isoformat(timespec="seconds") + "Z",
236
+ "stages_s": stage_timings,
237
+ "total_s": float(total_s),
238
+ "config": {
239
+ "timeout_rewrite_s": stage1_rewrite_timeout_s,
240
+ "timeout_struct_s": stage1_struct_timeout_s,
241
+ "timeout_probe_s": stage1_probe_timeout_s,
242
+ "timeout_select_s": stage3_select_timeout_s,
243
+ },
244
+ }
245
+ with timing_log_path.open("a", encoding="utf-8") as f:
246
+ f.write(json.dumps(rec, ensure_ascii=True) + "\n")
247
+ log(f"Timing Log: wrote {timing_log_path}")
248
+ except Exception as e:
249
+ log(f"Timing Log: failed ({type(e).__name__}: {e})")
250
+
251
+ def _future_with_timeout(fut, timeout_s: float, stage_name: str, fallback):
252
+ t0 = time.perf_counter()
253
+ try:
254
+ out = fut.result(timeout=max(1.0, float(timeout_s)))
255
+ dt = time.perf_counter() - t0
256
+ log(f"{stage_name}: {dt:.2f}s")
257
+ stage_key = {
258
+ "Rewrite": "rewrite",
259
+ "Structural inference": "structural",
260
+ "Probe inference": "probe",
261
+ "Index selection": "selection",
262
+ }.get(stage_name)
263
+ if stage_key:
264
+ _record_timing(stage_key, dt)
265
+ return out
266
+ except FutureTimeoutError:
267
+ fut.cancel()
268
+ log(f"{stage_name}: timed out after {timeout_s:.0f}s; using fallback")
269
+ return fallback
270
+ except Exception as e:
271
+ log(f"{stage_name}: failed ({type(e).__name__}: {e}); using fallback")
272
+ return fallback
273
+
274
+ t_total0 = time.perf_counter()
275
  log("Start: received prompt")
276
  prompt_in = (user_prompt or "").strip()
277
  if not prompt_in:
278
+ return "Error: empty prompt", "", ""
279
 
280
  log("Input:")
281
  log(prompt_in)
282
  log("")
283
+ log(
284
+ "Runtime config: "
285
+ f"retrieval_global_k={retrieval_global_k} "
286
+ f"retrieval_per_phrase_k={retrieval_per_phrase_k} "
287
+ f"retrieval_per_phrase_final_k={retrieval_per_phrase_final_k} "
288
+ f"selection_mode={selection_mode} "
289
+ f"selection_chunk_size={selection_chunk_size} "
290
+ f"selection_per_phrase_k={selection_per_phrase_k}"
291
+ )
292
+ log("")
293
 
294
+ t0 = time.perf_counter()
295
  user_tags = extract_user_provided_tags_upto_3_words(prompt_in)
296
+ dt = time.perf_counter()-t0
297
+ _record_timing("preprocess", dt)
298
+ log(f"Preprocess (user tag extraction): {dt:.2f}s")
299
  log("Heuristically extracted user tags:")
300
  if user_tags:
301
  log(", ".join(user_tags))
 
310
  fut_struct = ex.submit(llm_infer_structural_tags, prompt_in, log=log)
311
  fut_probe = ex.submit(llm_infer_probe_tags, prompt_in, log=log) if enable_probe_tags else None
312
 
313
+ rewritten = _future_with_timeout(
314
+ fut_rewrite, stage1_rewrite_timeout_s, "Rewrite", prompt_in
315
+ )
316
+ structural_tags = _future_with_timeout(
317
+ fut_struct, stage1_struct_timeout_s, "Structural inference", []
318
+ )
319
+ probe_tags = (
320
+ _future_with_timeout(fut_probe, stage1_probe_timeout_s, "Probe inference", [])
321
+ if fut_probe else []
322
+ )
323
 
324
  log("Rewrite:")
325
  log(rewritten if rewritten else "(empty)")
 
333
 
334
  log("Step 2: Prompt Squirrel retrieval (hidden)")
335
  try:
336
+ t0 = time.perf_counter()
337
  retrieval_context_tags = list(dict.fromkeys((structural_tags or []) + (probe_tags or [])))
338
  rewrite_phrases = [p.strip() for p in (rewrite_for_retrieval or "").split(",") if p.strip()]
339
  retrieval_result = psq_candidates_from_rewrite_phrases(
340
  rewrite_phrases=rewrite_phrases,
341
  allow_nsfw_tags=allow_nsfw_tags,
342
  context_tags=retrieval_context_tags,
343
+ global_k=max(1, retrieval_global_k),
344
+ per_phrase_k=max(1, retrieval_per_phrase_k),
345
+ per_phrase_final_k=max(1, retrieval_per_phrase_final_k),
346
  verbose=verbose_retrieval,
347
  )
348
  if isinstance(retrieval_result, tuple):
349
  candidates, phrase_reports = retrieval_result
350
  else:
351
  candidates, phrase_reports = retrieval_result, []
352
+ if selection_candidate_cap > 0 and len(candidates) > selection_candidate_cap:
353
+ candidates = candidates[:selection_candidate_cap]
354
+ log(f"Selection candidate cap applied: {selection_candidate_cap}")
355
+ dt = time.perf_counter()-t0
356
+ _record_timing("retrieval", dt)
357
+ log(f"Retrieval: {dt:.2f}s")
358
  log(f"Retrieved {len(candidates)} candidate tags")
359
  if verbose_retrieval:
360
  log(f"Total unique candidates: {len(candidates)}")
 
405
  structural_tags=structural_tags,
406
  probe_tags=probe_tags,
407
  )
408
+ with ThreadPoolExecutor(max_workers=1) as ex:
409
+ fut_sel = ex.submit(
410
+ llm_select_indices,
411
+ query_text=selection_query,
412
+ candidates=candidates,
413
+ max_pick=0,
414
+ log=log,
415
+ mode=selection_mode,
416
+ chunk_size=max(1, selection_chunk_size),
417
+ per_phrase_k=max(1, selection_per_phrase_k),
418
+ )
419
+ picked_indices = _future_with_timeout(
420
+ fut_sel, stage3_select_timeout_s, "Index selection", []
421
+ )
422
 
423
  selected_tags = [candidates[i].tag for i in picked_indices] if picked_indices else []
424
 
 
440
  log(" No probe tags inferred")
441
 
442
  log("Step 3c: Expand via tag implications")
443
+ t0 = time.perf_counter()
444
  tag_set = set(selected_tags)
445
  expanded, implied_only = expand_tags_via_implications(tag_set)
446
+ dt = time.perf_counter()-t0
447
+ _record_timing("implication_expansion", dt)
448
+ log(f"Implication expansion: {dt:.2f}s")
449
  if implied_only:
450
  selected_tags.extend(sorted(implied_only))
451
  log(f" Added {len(implied_only)} implied tags: {', '.join(sorted(implied_only))}")
 
453
  log(" No additional implied tags")
454
 
455
  log("Step 4: Compose final prompt")
456
+ t0 = time.perf_counter()
457
  final_prompt = compose_final_prompt(rewritten, selected_tags)
458
+ dt = time.perf_counter()-t0
459
+ _record_timing("prompt_composition", dt)
460
+ log(f"Prompt composition: {dt:.2f}s")
461
+
462
+ log("Step 5: Build ranked group/category display")
463
+ t0 = time.perf_counter()
464
+ seed_terms = []
465
+ seed_terms.extend(user_tags)
466
+ seed_terms.extend([p.strip() for p in (rewritten or "").split(",") if p.strip()])
467
+ seed_terms.extend(structural_tags or [])
468
+ seed_terms.extend(probe_tags or [])
469
+ seed_terms.extend(selected_tags)
470
+ seed_terms = list(dict.fromkeys(seed_terms))
471
+
472
+ groups_md = render_group_rankings_markdown(
473
+ seed_terms=seed_terms,
474
+ top_groups=max(1, int(display_top_groups)),
475
+ top_tags_per_group=max(1, int(display_top_tags_per_group)),
476
+ group_rank_top_k=max(1, int(display_rank_top_k)),
477
+ )
478
+ dt = time.perf_counter()-t0
479
+ _record_timing("group_display", dt)
480
+ log(f"Ranked group display: {dt:.2f}s")
481
 
482
+ total_dt = time.perf_counter()-t_total0
483
+ _emit_timing_summary(total_dt)
484
+ _append_timing_jsonl(total_dt)
485
  log("Done: final prompt ready")
486
+ return "\n".join(logs), final_prompt, groups_md
487
 
488
  except Exception as e:
489
  log(f"Error: {type(e).__name__}: {e}")
490
+ return "\n".join(logs), "", ""
491
 
492
 
493
 
 
500
  lines=1
501
  )
502
  with gr.Column(scale=1):
503
+ _mascot_pil = _load_mascot_image()
504
+ if _mascot_pil is not None:
505
+ mascot_img = gr.Image(
506
+ value=_mascot_pil,
507
+ show_label=False,
508
+ interactive=False,
509
+ height=220,
510
+ elem_id="mascot"
511
+ )
512
+ else:
513
+ mascot_img = gr.Markdown("`(mascot image unavailable)`")
514
  submit_button = gr.Button("Run", variant="primary")
515
 
516
  gr.Markdown(
 
536
  placeholder="Your optimized prompt will appear here."
537
  )
538
 
539
+ with gr.Accordion("Display Settings", open=False):
540
+ with gr.Row():
541
+ display_top_groups = gr.Number(
542
+ value=display_top_groups_default,
543
+ precision=0,
544
+ label="Rows (Top Groups/Categories)",
545
+ minimum=1,
546
+ )
547
+ display_top_tags_per_group = gr.Number(
548
+ value=display_top_tags_per_group_default,
549
+ precision=0,
550
+ label="Top Tags Shown Per Row",
551
+ minimum=1,
552
+ )
553
+ display_rank_top_k = gr.Number(
554
+ value=display_rank_top_k_default,
555
+ precision=0,
556
+ label="Top Tags Used for Row Ranking",
557
+ minimum=1,
558
+ )
559
+
560
+ group_rankings_md = gr.Markdown(
561
+ label="Ranked Group/Category Tag Suggestions",
562
+ value="",
563
+ )
564
+
565
  submit_button.click(
566
  rag_pipeline_ui,
567
+ inputs=[image_tags, display_top_groups, display_top_tags_per_group, display_rank_top_k],
568
+ outputs=[console, final_prompt, group_rankings_md]
569
  )
570
 
571
  image_tags.submit(
572
  rag_pipeline_ui,
573
+ inputs=[image_tags, display_top_groups, display_top_tags_per_group, display_rank_top_k],
574
+ outputs=[console, final_prompt, group_rankings_md]
575
  )
576
 
577
  if __name__ == "__main__":
data/analysis/simplified_probe_tags.csv CHANGED
@@ -32,5 +32,5 @@ thick_thighs,body_shape_breasts,0,0.025000,0.001367,1,1,0.500000,1.000000,0.6666
32
  nude,clothing_state,0,0.057000,0.004049,1,3,0.000000,0.000000,0.000000,0.000000,0.001012,0,"support=3, f1=0.000, prec=0.000, rec=0.000"
33
  humanoid,body_type_presence,1,0.076000,0.003484,1,6,0.000000,0.000000,0.000000,0.000000,0.000871,0,"support=6, f1=0.000, prec=0.000, rec=0.000"
34
  bird,species_taxonomy,0,0.042000,0.001184,1,6,0.571429,0.666667,0.615385,0.615385,0.000842,1,"support=6, f1=0.615, prec=0.571, rec=0.667"
35
- bear,species_taxonomy,0,0.038000,0.001141,1,5,0.500000,0.800000,0.615385,0.615385,0.000812,1,"support=5, f1=0.615, prec=0.500, rec=0.800"
36
  <3,text_symbols,1,0.050000,0.000364,1,6,1.000000,0.500000,0.666667,0.666667,0.000273,1,"support=6, f1=0.667, prec=1.000, rec=0.500"
 
32
  nude,clothing_state,0,0.057000,0.004049,1,3,0.000000,0.000000,0.000000,0.000000,0.001012,0,"support=3, f1=0.000, prec=0.000, rec=0.000"
33
  humanoid,body_type_presence,1,0.076000,0.003484,1,6,0.000000,0.000000,0.000000,0.000000,0.000871,0,"support=6, f1=0.000, prec=0.000, rec=0.000"
34
  bird,species_taxonomy,0,0.042000,0.001184,1,6,0.571429,0.666667,0.615385,0.615385,0.000842,1,"support=6, f1=0.615, prec=0.571, rec=0.667"
35
+ bear,species_taxonomy,0,0.038000,0.001141,0,5,0.500000,0.800000,0.615385,0.615385,0.000812,0,"support=5, f1=0.615, prec=0.500, rec=0.800"
36
  <3,text_symbols,1,0.050000,0.000364,1,6,1.000000,0.500000,0.666667,0.666667,0.000273,1,"support=6, f1=0.667, prec=1.000, rec=0.500"
data/eval_results/k1_default_recheck_seed42_n10.jsonl ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"_meta": true, "timestamp": "2026-03-03T07:10:22.047827", "n_samples": 10, "caption_field": "caption_cogvlm", "skip_rewrite": false, "allow_nsfw": false, "mode": "chunked_map_union", "chunk_size": 60, "eval_path": "data\\eval_samples\\e621_sfw_sample_1000_seed123_buffer10000_caption_evident.jsonl", "per_phrase_k": 2, "per_phrase_final_k": 1, "temperature": 0.0, "shuffle": true, "seed": 42, "workers": 1, "min_why": "strong_implied", "expand_implications": true, "infer_structural": true, "infer_probe": true, "n_errors": 0, "n_issue_samples": 10, "n_issues_total": 20}
2
+ {"id": 17482, "n_gt": 22, "n_retrieved": 20, "n_selected": 29, "n_implied": 17, "n_structural": 4, "n_probe": 3, "ret_R": 0.2727, "P": 0.5172, "R": 0.6818, "F1": 0.5882, "leaf_P": 0.4167, "leaf_R": 0.3846, "leaf_F1": 0.4, "n_leaf_sel": 12, "n_leaf_gt": 13, "ret_P": 0.3, "sel_given_ret": 2.5, "over_sel": 1.32, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 7, "attempts_by_n_local": {"22": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.5172, "gen_R": 0.6818, "gen_F1": 0.5882, "missed": ["bass_guitar", "canine", "fingers", "fur", "holding_musical_instrument", "holding_object", "music"], "extra": ["bottomwear", "denim", "denim_clothing", "flowing_hair", "jeans", "looking_at_viewer", "pants", "pastel_background", "playing_guitar", "playing_music", "pose", "torn_bottomwear", "torn_jeans", "torn_pants"], "ground_truth_tags": ["anthro", "bass_guitar", "canid", "canine", "claws", "clothed", "clothing", "fingers", "fur", "guitar", "hair", "holding_musical_instrument", "holding_object", "mammal", "music", "musical_instrument", "plucked_string_instrument", "solo", "spade_tail", "string_instrument", "tail", "torn_clothing"], "selected_tags": ["anthro", "bottomwear", "canid", "claws", "clothed", "clothing", "denim", "denim_clothing", "flowing_hair", "guitar", "hair", "jeans", "looking_at_viewer", "mammal", "musical_instrument", "pants", "pastel_background", "playing_guitar", "playing_music", "plucked_string_instrument", "pose", "solo", "spade_tail", "string_instrument", "tail", "torn_bottomwear", 
"torn_clothing", "torn_jeans", "torn_pants"], "stage3_selected": ["claws", "flowing_hair", "pastel_background", "playing_guitar", "pose", "spade_tail", "torn_jeans"], "stage3_selected_scores": {"claws": 0.5637, "pose": 0.5717, "spade_tail": 0.6167, "playing_guitar": 0.9311, "torn_jeans": 0.481, "flowing_hair": 0.5655, "pastel_background": 0.56}, "stage3_selected_ranks": {"claws": 10, "pose": 7, "spade_tail": 3, "playing_guitar": 2, "torn_jeans": 18, "flowing_hair": 9, "pastel_background": 12}, "stage3_selected_phrase_ranks": {"claws": 1, "pose": 1, "spade_tail": 1, "playing_guitar": 1, "torn_jeans": 1, "flowing_hair": 1, "pastel_background": 1}, "extra_evidence": {"bottomwear": {"source": "implied"}, "denim": {"source": "implied"}, "denim_clothing": {"source": "implied"}, "flowing_hair": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5655}, "jeans": {"source": "implied"}, "looking_at_viewer": {"source": "structural"}, "pants": {"source": "implied"}, "pastel_background": {"source": "stage3", "why": "unknown", "retrieval_score": 0.56}, "playing_guitar": {"source": "stage3", "why": "unknown", "retrieval_score": 0.9311}, "playing_music": {"source": "implied"}, "pose": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5717}, "torn_bottomwear": {"source": "implied"}, "torn_jeans": {"source": "stage3", "why": "unknown", "retrieval_score": 0.481}, "torn_pants": {"source": "implied"}}, "structural": ["solo", "anthro", "clothed", "looking_at_viewer"], "probe": ["solo", "canid", "anthro"], "t1": 3.4, "t2": 3.66, "t3": 1.38, "t3s": 3.59, "t3p": 5.79, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=22 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
3
+ {"id": 1624724, "n_gt": 4, "n_retrieved": 12, "n_selected": 17, "n_implied": 3, "n_structural": 4, "n_probe": 3, "ret_R": 0.75, "P": 0.1765, "R": 0.75, "F1": 0.2857, "leaf_P": 0.2143, "leaf_R": 0.75, "leaf_F1": 0.3333, "n_leaf_sel": 14, "n_leaf_gt": 4, "ret_P": 0.25, "sel_given_ret": 1.0, "over_sel": 4.25, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 10, "attempts_by_n_local": {"15": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.1765, "gen_R": 0.75, "gen_F1": 0.2857, "missed": ["smile"], "extra": ["ambiguous_gender", "anthro", "bear", "big_eyes", "cartoon_character", "clothed", "clothing", "eyes", "floating", "mammal", "nose", "pink_mouth", "spots", "topless"], "ground_truth_tags": ["red_nose", "smile", "solo", "tan_body"], "selected_tags": ["ambiguous_gender", "anthro", "bear", "big_eyes", "cartoon_character", "clothed", "clothing", "eyes", "floating", "mammal", "nose", "pink_mouth", "red_nose", "solo", "spots", "tan_body", "topless"], "stage3_selected": ["big_eyes", "cartoon_character", "eyes", "floating", "nose", "pink_mouth", "red_nose", "spots", "tan_body", "white_background"], "stage3_selected_scores": {"white_background": 0.6199, "tan_body": 0.667, "spots": 0.6295, "big_eyes": 0.6992, "red_nose": 0.752, "floating": 0.6502, "pink_mouth": 0.639, "nose": 0.8607, "cartoon_character": 0.5052, "eyes": 0.9251}, "stage3_selected_ranks": {"white_background": 9, "tan_body": 5, "spots": 8, "big_eyes": 4, "red_nose": 3, "floating": 6, "pink_mouth": 7, "nose": 2, "cartoon_character": 13, 
"eyes": 1}, "stage3_selected_phrase_ranks": {"white_background": 1, "tan_body": 1, "spots": 1, "big_eyes": 1, "red_nose": 1, "floating": 1, "pink_mouth": 1, "nose": 1, "cartoon_character": 1, "eyes": 1}, "extra_evidence": {"ambiguous_gender": {"source": "structural"}, "anthro": {"source": "structural"}, "bear": {"source": "probe"}, "big_eyes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6992}, "cartoon_character": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5052}, "clothed": {"source": "implied"}, "clothing": {"source": "implied"}, "eyes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.9251}, "floating": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6502}, "mammal": {"source": "implied"}, "nose": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8607}, "pink_mouth": {"source": "stage3", "why": "unknown", "retrieval_score": 0.639}, "spots": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6295}, "topless": {"source": "structural"}}, "structural": ["solo", "anthro", "ambiguous_gender", "topless"], "probe": ["solo", "simple_background", "bear"], "t1": 2.19, "t2": 1.18, "t3": 11.61, "t3s": 1.02, "t3p": 3.9, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=15 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=5"]}
4
+ {"id": 1078019, "n_gt": 14, "n_retrieved": 15, "n_selected": 15, "n_implied": 2, "n_structural": 4, "n_probe": 5, "ret_R": 0.3571, "P": 0.5333, "R": 0.5714, "F1": 0.5517, "leaf_P": 0.5455, "leaf_R": 0.6667, "leaf_F1": 0.6, "n_leaf_sel": 11, "n_leaf_gt": 9, "ret_P": 0.3333, "sel_given_ret": 1.6, "over_sel": 1.07, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 6, "attempts_by_n_local": {"16": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.5333, "gen_R": 0.5714, "gen_F1": 0.5517, "missed": ["lagomorph", "leporid", "mammal", "rabbit", "romantic", "romantic_couple"], "extra": ["<3", "coat", "holding_object", "holding_plushie", "looking_at_viewer", "relationship", "topwear"], "ground_truth_tags": ["anthro", "blue_eyes", "blush", "clothed", "clothing", "duo", "lagomorph", "leporid", "mammal", "plushie", "rabbit", "romantic", "romantic_couple", "teal_eyes"], "selected_tags": ["<3", "anthro", "blue_eyes", "blush", "clothed", "clothing", "coat", "duo", "holding_object", "holding_plushie", "looking_at_viewer", "plushie", "relationship", "teal_eyes", "topwear"], "stage3_selected": ["blue_eyes", "coat", "holding_plushie", "plushie", "relationship", "teal_eyes"], "stage3_selected_scores": {"blue_eyes": 0.6151, "coat": 0.6383, "plushie": 0.7455, "teal_eyes": 0.6283, "holding_plushie": 0.7793, "relationship": 0.6206}, "stage3_selected_ranks": {"blue_eyes": 9, "coat": 5, "plushie": 3, "teal_eyes": 6, "holding_plushie": 2, "relationship": 7}, "stage3_selected_phrase_ranks": {"blue_eyes": 1, "coat": 1, 
"plushie": 1, "teal_eyes": 1, "holding_plushie": 1, "relationship": 1}, "extra_evidence": {"<3": {"source": "probe"}, "coat": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6383}, "holding_object": {"source": "implied"}, "holding_plushie": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7793}, "looking_at_viewer": {"source": "structural"}, "relationship": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6206}, "topwear": {"source": "implied"}}, "structural": ["duo", "anthro", "clothed", "looking_at_viewer"], "probe": ["duo", "clothing", "blush", "anthro", "<3"], "t1": 2.27, "t2": 1.48, "t3": 1.38, "t3s": 1.8, "t3p": 2.97, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=16 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
5
+ {"id": 2021552, "n_gt": 25, "n_retrieved": 16, "n_selected": 31, "n_implied": 12, "n_structural": 4, "n_probe": 7, "ret_R": 0.44, "P": 0.6774, "R": 0.84, "F1": 0.75, "leaf_P": 0.5882, "leaf_R": 0.6667, "leaf_F1": 0.625, "n_leaf_sel": 17, "n_leaf_gt": 15, "ret_P": 0.6875, "sel_given_ret": 1.9091, "over_sel": 1.24, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 11, "attempts_by_n_local": {"18": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.6774, "gen_R": 0.84, "gen_F1": 0.75, "missed": ["canine", "fox", "looking_at_another", "standing"], "extra": ["black_bottomwear", "black_clothing", "black_pants", "blush", "felid", "looking_at_viewer", "open_mouth", "white_clothing", "white_shirt", "white_topwear"], "ground_truth_tags": ["anthro", "bottomwear", "canid", "canine", "claws", "clothed", "clothing", "crossed_arms", "duo", "facial_markings", "fox", "fur", "grey_background", "head_markings", "lagomorph", "leporid", "looking_at_another", "mammal", "markings", "overalls", "pants", "rabbit", "shirt", "standing", "topwear"], "selected_tags": ["anthro", "black_bottomwear", "black_clothing", "black_pants", "blush", "bottomwear", "canid", "claws", "clothed", "clothing", "crossed_arms", "duo", "facial_markings", "felid", "fur", "grey_background", "head_markings", "lagomorph", "leporid", "looking_at_viewer", "mammal", "markings", "open_mouth", "overalls", "pants", "rabbit", "shirt", "topwear", "white_clothing", "white_shirt", "white_topwear"], "stage3_selected": ["black_pants", "claws", "crossed_arms", 
"facial_markings", "fur", "grey_background", "open_mouth", "overalls", "rabbit", "shirt", "white_shirt"], "stage3_selected_scores": {"fur": 0.6548, "open_mouth": 0.6344, "claws": 0.6317, "shirt": 0.7497, "rabbit": 0.6521, "grey_background": 0.6797, "facial_markings": 0.6956, "crossed_arms": 0.7298, "white_shirt": 0.8206, "overalls": 0.8782, "black_pants": 0.8338}, "stage3_selected_ranks": {"fur": 11, "open_mouth": 14, "claws": 15, "shirt": 6, "rabbit": 12, "grey_background": 10, "facial_markings": 8, "crossed_arms": 7, "white_shirt": 4, "overalls": 2, "black_pants": 3}, "stage3_selected_phrase_ranks": {"fur": 1, "open_mouth": 1, "claws": 1, "shirt": 1, "rabbit": 1, "grey_background": 1, "facial_markings": 1, "crossed_arms": 1, "white_shirt": 1, "overalls": 1, "black_pants": 1}, "extra_evidence": {"black_bottomwear": {"source": "implied"}, "black_clothing": {"source": "implied"}, "black_pants": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8338}, "blush": {"source": "probe"}, "felid": {"source": "probe"}, "looking_at_viewer": {"source": "structural"}, "open_mouth": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6344}, "white_clothing": {"source": "implied"}, "white_shirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8206}, "white_topwear": {"source": "implied"}}, "structural": ["duo", "anthro", "clothed", "looking_at_viewer"], "probe": ["simple_background", "felid", "duo", "clothing", "canid", "blush", "anthro"], "t1": 5.92, "t2": 1.55, "t3": 4.86, "t3s": 4.42, "t3p": 7.2, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=18 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
6
+ {"id": 1023509, "n_gt": 13, "n_retrieved": 17, "n_selected": 31, "n_implied": 6, "n_structural": 4, "n_probe": 5, "ret_R": 0.2308, "P": 0.2903, "R": 0.6923, "F1": 0.4091, "leaf_P": 0.16, "leaf_R": 0.6667, "leaf_F1": 0.2581, "n_leaf_sel": 25, "n_leaf_gt": 6, "ret_P": 0.1765, "sel_given_ret": 3.0, "over_sel": 2.38, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 17, "attempts_by_n_local": {"17": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.2903, "gen_R": 0.6923, "gen_F1": 0.4091, "missed": ["dialogue", "fur", "white_body", "white_fur"], "extra": ["<3", "anthro", "bear", "bubble", "darkness", "duo", "face_mask", "felid", "figurine", "group", "light", "lying_on_ground", "note", "pear-shaped_figure", "power_lines", "solo", "speech_bubble", "standing_over", "texting", "underground", "unknown_species", "wide_hips"], "ground_truth_tags": ["bovid", "caprine", "dialogue", "fur", "goat", "human", "lizard", "mammal", "reptile", "scalie", "text", "white_body", "white_fur"], "selected_tags": ["<3", "anthro", "bear", "bovid", "bubble", "caprine", "darkness", "duo", "face_mask", "felid", "figurine", "goat", "group", "human", "light", "lizard", "lying_on_ground", "mammal", "note", "pear-shaped_figure", "power_lines", "reptile", "scalie", "solo", "speech_bubble", "standing_over", "text", "texting", "underground", "unknown_species", "wide_hips"], "stage3_selected": ["bubble", "darkness", "face_mask", "figurine", "goat", "human", "light", "lizard", "lying_on_ground", "note", "pear-shaped_figure", 
"power_lines", "speech_bubble", "standing_over", "texting", "underground", "unknown_species"], "stage3_selected_scores": {"human": 0.669, "speech_bubble": 0.7584, "lizard": 0.839, "goat": 0.7768, "light": 0.7793, "unknown_species": 0.7697, "bubble": 0.7508, "pear-shaped_figure": 0.5657, "lying_on_ground": 0.7947, "face_mask": 0.5493, "darkness": 0.8328, "texting": 0.5661, "note": 0.7398, "underground": 0.5853, "figurine": 0.7007, "standing_over": 0.7647, "power_lines": 0.5072}, "stage3_selected_ranks": {"human": 12, "speech_bubble": 8, "lizard": 1, "goat": 5, "light": 4, "unknown_species": 6, "bubble": 9, "pear-shaped_figure": 15, "lying_on_ground": 3, "face_mask": 16, "darkness": 2, "texting": 14, "note": 10, "underground": 13, "figurine": 11, "standing_over": 7, "power_lines": 17}, "stage3_selected_phrase_ranks": {"human": 1, "speech_bubble": 1, "lizard": 1, "goat": 1, "light": 1, "unknown_species": 1, "bubble": 1, "pear-shaped_figure": 1, "lying_on_ground": 1, "face_mask": 1, "darkness": 1, "texting": 1, "note": 1, "underground": 1, "figurine": 1, "standing_over": 1, "power_lines": 1}, "extra_evidence": {"<3": {"source": "probe"}, "anthro": {"source": "probe"}, "bear": {"source": "probe"}, "bubble": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7508}, "darkness": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8328}, "duo": {"source": "structural"}, "face_mask": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5493}, "felid": {"source": "probe"}, "figurine": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7007}, "group": {"source": "structural"}, "light": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7793}, "lying_on_ground": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7947}, "note": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7398}, "pear-shaped_figure": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5657}, "power_lines": {"source": "stage3", "why": 
"unknown", "retrieval_score": 0.5072}, "solo": {"source": "structural"}, "speech_bubble": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7584}, "standing_over": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7647}, "texting": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5661}, "underground": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5853}, "unknown_species": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7697}, "wide_hips": {"source": "implied"}}, "structural": ["solo", "duo", "group", "text"], "probe": ["group", "felid", "bear", "anthro", "<3"], "t1": 3.19, "t2": 1.51, "t3": 4.0, "t3s": 0.83, "t3p": 1.94, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=17 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0"]}
7
+ {"id": 335343, "n_gt": 15, "n_retrieved": 24, "n_selected": 31, "n_implied": 3, "n_structural": 3, "n_probe": 6, "ret_R": 0.6, "P": 0.3226, "R": 0.6667, "F1": 0.4348, "leaf_P": 0.3077, "leaf_R": 0.6667, "leaf_F1": 0.4211, "n_leaf_sel": 26, "n_leaf_gt": 12, "ret_P": 0.375, "sel_given_ret": 1.1111, "over_sel": 2.07, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 23, "attempts_by_n_local": {"25": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3226, "gen_R": 0.6667, "gen_F1": 0.4348, "missed": ["angry", "bed", "eyes_closed", "eyeshadow", "furniture"], "extra": ["annoyed_expression", "anthro", "atmosphere", "bedroom", "blush", "distracting_watermark", "eyes", "felid", "font", "humanoid", "mammal", "membrane_(anatomy)", "palette", "playful", "purple_membrane", "resting", "romantic", "romantic_ambiance", "stats", "walking", "watermark"], "ground_truth_tags": ["angry", "bed", "blonde_hair", "blue_eyes", "duo", "eyes_closed", "eyeshadow", "furniture", "green_eyes", "hair", "lying", "makeup", "purple_hair", "sleeping", "text"], "selected_tags": ["annoyed_expression", "anthro", "atmosphere", "bedroom", "blonde_hair", "blue_eyes", "blush", "distracting_watermark", "duo", "eyes", "felid", "font", "green_eyes", "hair", "humanoid", "lying", "makeup", "mammal", "membrane_(anatomy)", "palette", "playful", "purple_hair", "purple_membrane", "resting", "romantic", "romantic_ambiance", "sleeping", "stats", "text", "walking", "watermark"], "stage3_selected": ["annoyed_expression", "atmosphere", "bedroom", 
"blonde_hair", "blue_eyes", "distracting_watermark", "eyes", "font", "green_eyes", "hair", "lying", "makeup", "palette", "playful", "purple_hair", "purple_membrane", "resting", "romantic_ambiance", "sleeping", "stats", "text", "walking", "watermark"], "stage3_selected_scores": {"hair": 0.6041, "text": 0.6017, "blue_eyes": 0.6023, "lying": 0.4504, "green_eyes": 0.5999, "blonde_hair": 0.5995, "purple_hair": 0.5647, "makeup": 0.5972, "watermark": 0.6051, "bedroom": 0.491, "sleeping": 0.6037, "walking": 0.3595, "romantic_ambiance": 0.4811, "distracting_watermark": 0.4792, "playful": 0.4474, "resting": 0.5152, "annoyed_expression": 0.7259, "stats": 0.5067, "palette": 0.669, "purple_membrane": 0.5791, "atmosphere": 0.5048, "font": 0.5305, "eyes": 0.895}, "stage3_selected_ranks": {"hair": 5, "text": 8, "blue_eyes": 7, "lying": 22, "green_eyes": 9, "blonde_hair": 10, "purple_hair": 13, "makeup": 11, "watermark": 4, "bedroom": 19, "sleeping": 6, "walking": 25, "romantic_ambiance": 20, "distracting_watermark": 21, "playful": 23, "resting": 16, "annoyed_expression": 2, "stats": 17, "palette": 3, "purple_membrane": 12, "atmosphere": 18, "font": 15, "eyes": 1}, "stage3_selected_phrase_ranks": {"hair": 1, "text": 1, "blue_eyes": 1, "lying": 1, "green_eyes": 1, "blonde_hair": 1, "purple_hair": 1, "makeup": 1, "watermark": 1, "bedroom": 1, "sleeping": 1, "walking": 1, "romantic_ambiance": 1, "distracting_watermark": 1, "playful": 1, "resting": 1, "annoyed_expression": 1, "stats": 1, "palette": 1, "purple_membrane": 1, "atmosphere": 1, "font": 1, "eyes": 1}, "extra_evidence": {"annoyed_expression": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7259}, "anthro": {"source": "probe"}, "atmosphere": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5048}, "bedroom": {"source": "stage3", "why": "unknown", "retrieval_score": 0.491}, "blush": {"source": "probe"}, "distracting_watermark": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4792}, "eyes": 
{"source": "stage3", "why": "unknown", "retrieval_score": 0.895}, "felid": {"source": "probe"}, "font": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5305}, "humanoid": {"source": "structural"}, "mammal": {"source": "implied"}, "membrane_(anatomy)": {"source": "implied"}, "palette": {"source": "stage3", "why": "unknown", "retrieval_score": 0.669}, "playful": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4474}, "purple_membrane": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5791}, "resting": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5152}, "romantic": {"source": "implied"}, "romantic_ambiance": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4811}, "stats": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5067}, "walking": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3595}, "watermark": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6051}}, "structural": ["duo", "humanoid", "text"], "probe": ["text", "simple_background", "felid", "duo", "blush", "anthro"], "t1": 2.71, "t2": 2.24, "t3": 7.99, "t3s": 3.84, "t3p": 5.53, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=25 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
8
+ {"id": 2034167, "n_gt": 11, "n_retrieved": 20, "n_selected": 33, "n_implied": 9, "n_structural": 4, "n_probe": 4, "ret_R": 0.5455, "P": 0.3333, "R": 1.0, "F1": 0.5, "leaf_P": 0.3333, "leaf_R": 0.8571, "leaf_F1": 0.48, "n_leaf_sel": 18, "n_leaf_gt": 7, "ret_P": 0.3, "sel_given_ret": 1.8333, "over_sel": 3.0, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 21, "attempts_by_n_local": {"23": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3333, "gen_R": 1.0, "gen_F1": 0.5, "missed": [], "extra": ["action_pose", "ambiguous_gender", "animal_humanoid", "animated_png", "anthro", "canid_humanoid", "canine_humanoid", "clothed", "clothing", "curved_tail", "eyes", "half-length_portrait", "humanoid", "mammal_humanoid", "nose", "pink_stripes", "pink_tail", "portrait", "pose", "stripes", "tail", "topless"], "ground_truth_tags": ["blue_eyes", "blue_nose", "canid", "canine", "fur", "mammal", "open_mouth", "purple_body", "solo", "white_body", "white_fur"], "selected_tags": ["action_pose", "ambiguous_gender", "animal_humanoid", "animated_png", "anthro", "blue_eyes", "blue_nose", "canid", "canid_humanoid", "canine", "canine_humanoid", "clothed", "clothing", "curved_tail", "eyes", "fur", "half-length_portrait", "humanoid", "mammal", "mammal_humanoid", "nose", "open_mouth", "pink_stripes", "pink_tail", "portrait", "pose", "purple_body", "solo", "stripes", "tail", "topless", "white_body", "white_fur"], "stage3_selected": ["action_pose", "animated_png", "blue_eyes", "blue_nose", "canine_humanoid", "curved_tail", 
"eyes", "fur", "half-length_portrait", "humanoid", "invalid_background", "nose", "open_mouth", "pink_stripes", "pink_tail", "pose", "purple_body", "simple_background", "stripes", "tail", "white_fur"], "stage3_selected_scores": {"fur": 0.5679, "simple_background": 0.5795, "open_mouth": 0.5861, "tail": 0.5909, "blue_eyes": 0.5832, "white_fur": 0.5785, "humanoid": 0.6719, "stripes": 0.5793, "pose": 0.6, "purple_body": 0.5484, "canine_humanoid": 0.9129, "blue_nose": 0.5927, "half-length_portrait": 0.464, "pink_tail": 0.5172, "action_pose": 0.5954, "pink_stripes": 0.5455, "curved_tail": 0.5963, "nose": 0.7036, "invalid_background": 0.5524, "eyes": 0.7512, "animated_png": 0.4463}, "stage3_selected_ranks": {"fur": 15, "simple_background": 12, "open_mouth": 10, "tail": 9, "blue_eyes": 11, "white_fur": 14, "humanoid": 4, "stripes": 13, "pose": 5, "purple_body": 17, "canine_humanoid": 1, "blue_nose": 8, "half-length_portrait": 22, "pink_tail": 20, "action_pose": 7, "pink_stripes": 18, "curved_tail": 6, "nose": 3, "invalid_background": 16, "eyes": 2, "animated_png": 23}, "stage3_selected_phrase_ranks": {"fur": 1, "simple_background": 1, "open_mouth": 1, "tail": 1, "blue_eyes": 1, "white_fur": 1, "humanoid": 1, "stripes": 1, "pose": 1, "purple_body": 1, "canine_humanoid": 1, "blue_nose": 1, "half-length_portrait": 1, "pink_tail": 1, "action_pose": 1, "pink_stripes": 1, "curved_tail": 1, "nose": 1, "invalid_background": 1, "eyes": 1, "animated_png": 1}, "extra_evidence": {"action_pose": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5954}, "ambiguous_gender": {"source": "structural"}, "animal_humanoid": {"source": "implied"}, "animated_png": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4463}, "anthro": {"source": "structural"}, "canid_humanoid": {"source": "implied"}, "canine_humanoid": {"source": "stage3", "why": "unknown", "retrieval_score": 0.9129}, "clothed": {"source": "implied"}, "clothing": {"source": "implied"}, "curved_tail": {"source": 
"stage3", "why": "unknown", "retrieval_score": 0.5963}, "eyes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7512}, "half-length_portrait": {"source": "stage3", "why": "unknown", "retrieval_score": 0.464}, "humanoid": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6719}, "mammal_humanoid": {"source": "implied"}, "nose": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7036}, "pink_stripes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5455}, "pink_tail": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5172}, "portrait": {"source": "implied"}, "pose": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6}, "stripes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5793}, "tail": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5909}, "topless": {"source": "structural"}}, "structural": ["solo", "anthro", "ambiguous_gender", "topless"], "probe": ["solo", "simple_background", "canid", "anthro"], "t1": 2.73, "t2": 1.82, "t3": 6.54, "t3s": 1.47, "t3p": 1.91, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=23 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=6"]}
9
+ {"id": 1325009, "n_gt": 22, "n_retrieved": 19, "n_selected": 29, "n_implied": 11, "n_structural": 5, "n_probe": 5, "ret_R": 0.1818, "P": 0.4828, "R": 0.6364, "F1": 0.549, "leaf_P": 0.25, "leaf_R": 0.3333, "leaf_F1": 0.2857, "n_leaf_sel": 16, "n_leaf_gt": 12, "ret_P": 0.2105, "sel_given_ret": 3.5, "over_sel": 1.32, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 10, "attempts_by_n_local": {"21": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.4828, "gen_R": 0.6364, "gen_F1": 0.549, "missed": ["chest_tuft", "countershading", "hand_on_head", "muscular", "muscular_anthro", "muscular_male", "topless", "tuft"], "extra": ["bear", "countershade_body", "fluffy_fur", "forest", "forest_background", "gesture", "looking_at_viewer", "nature", "nature_background", "plant", "raised_hand", "striped_body", "striped_fur", "tree", "white_chest"], "ground_truth_tags": ["anthro", "blue_eyes", "bottomwear", "chest_tuft", "clothed", "clothing", "countershading", "felid", "fur", "hand_on_head", "male", "mammal", "muscular", "muscular_anthro", "muscular_male", "pantherine", "shorts", "solo", "stripes", "tiger", "topless", "tuft"], "selected_tags": ["anthro", "bear", "blue_eyes", "bottomwear", "clothed", "clothing", "countershade_body", "felid", "fluffy_fur", "forest", "forest_background", "fur", "gesture", "looking_at_viewer", "male", "mammal", "nature", "nature_background", "pantherine", "plant", "raised_hand", "shorts", "solo", "striped_body", "striped_fur", "stripes", "tiger", "tree", "white_chest"], 
"stage3_selected": ["blue_eyes", "countershade_body", "fluffy_fur", "forest_background", "gesture", "raised_hand", "shorts", "striped_fur", "tiger", "white_chest"], "stage3_selected_scores": {"blue_eyes": 0.6084, "shorts": 0.6188, "tiger": 0.6311, "gesture": 0.6237, "striped_fur": 0.6808, "raised_hand": 0.7178, "forest_background": 0.6326, "white_chest": 0.9238, "countershade_body": 0.8643, "fluffy_fur": 0.6859}, "stage3_selected_ranks": {"blue_eyes": 17, "shorts": 16, "tiger": 13, "gesture": 15, "striped_fur": 8, "raised_hand": 6, "forest_background": 12, "white_chest": 2, "countershade_body": 3, "fluffy_fur": 7}, "stage3_selected_phrase_ranks": {"blue_eyes": 1, "shorts": 1, "tiger": 1, "gesture": 1, "striped_fur": 1, "raised_hand": 1, "forest_background": 1, "white_chest": 1, "countershade_body": 1, "fluffy_fur": 1}, "extra_evidence": {"bear": {"source": "probe"}, "countershade_body": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8643}, "fluffy_fur": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6859}, "forest": {"source": "implied"}, "forest_background": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6326}, "gesture": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6237}, "looking_at_viewer": {"source": "structural"}, "nature": {"source": "implied"}, "nature_background": {"source": "implied"}, "plant": {"source": "implied"}, "raised_hand": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7178}, "striped_body": {"source": "implied"}, "striped_fur": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6808}, "tree": {"source": "implied"}, "white_chest": {"source": "stage3", "why": "unknown", "retrieval_score": 0.9238}}, "structural": ["solo", "anthro", "male", "clothed", "looking_at_viewer"], "probe": ["solo", "felid", "clothing", "bear", "anthro"], "t1": 2.03, "t2": 1.98, "t3": 4.75, "t3s": 0.62, "t3p": 1.98, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 
generic_char_to_general=0 unknown_type=0", "Stage3 split: general=21 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=3"]}
10
+ {"id": 3285630, "n_gt": 12, "n_retrieved": 15, "n_selected": 25, "n_implied": 5, "n_structural": 4, "n_probe": 5, "ret_R": 0.25, "P": 0.4, "R": 0.8333, "F1": 0.5405, "leaf_P": 0.4, "leaf_R": 0.6667, "leaf_F1": 0.5, "n_leaf_sel": 15, "n_leaf_gt": 9, "ret_P": 0.2, "sel_given_ret": 3.3333, "over_sel": 2.08, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 15, "attempts_by_n_local": {"18": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.4, "gen_R": 0.8333, "gen_F1": 0.5405, "missed": ["alpha_channel", "fingers"], "extra": ["black_body", "black_fur", "business_attire", "formal", "holding_mug", "holding_object", "mug", "necktie", "shirt", "teal_shirt", "teal_topwear", "text", "topwear", "vest", "white_necktie"], "ground_truth_tags": ["alpha_channel", "anthro", "clothed", "clothing", "felid", "feline", "fingers", "fur", "hair", "male", "mammal", "solo"], "selected_tags": ["anthro", "black_body", "black_fur", "business_attire", "clothed", "clothing", "felid", "feline", "formal", "fur", "hair", "holding_mug", "holding_object", "male", "mammal", "mug", "necktie", "shirt", "solo", "teal_shirt", "teal_topwear", "text", "topwear", "vest", "white_necktie"], "stage3_selected": ["black_fur", "business_attire", "feline", "formal", "fur", "hair", "holding_mug", "invalid_background", "mug", "necktie", "shirt", "simple_background", "teal_shirt", "vest", "white_necktie"], "stage3_selected_scores": {"hair": 0.6803, "fur": 0.7146, "simple_background": 0.6978, "feline": 0.7062, "shirt": 0.7998, "black_fur": 
0.7183, "necktie": 0.7314, "vest": 0.8403, "mug": 0.8841, "holding_mug": 0.916, "formal": 0.5993, "business_attire": 0.5558, "teal_shirt": 0.7474, "white_necktie": 0.6418, "invalid_background": 0.6495}, "stage3_selected_ranks": {"hair": 12, "fur": 9, "simple_background": 11, "feline": 10, "shirt": 5, "black_fur": 8, "necktie": 7, "vest": 3, "mug": 2, "holding_mug": 1, "formal": 16, "business_attire": 18, "teal_shirt": 6, "white_necktie": 14, "invalid_background": 13}, "stage3_selected_phrase_ranks": {"hair": 1, "fur": 1, "simple_background": 1, "feline": 1, "shirt": 1, "black_fur": 1, "necktie": 1, "vest": 1, "mug": 1, "holding_mug": 1, "formal": 1, "business_attire": 1, "teal_shirt": 1, "white_necktie": 1, "invalid_background": 1}, "extra_evidence": {"black_body": {"source": "implied"}, "black_fur": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7183}, "business_attire": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5558}, "formal": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5993}, "holding_mug": {"source": "stage3", "why": "unknown", "retrieval_score": 0.916}, "holding_object": {"source": "implied"}, "mug": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8841}, "necktie": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7314}, "shirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7998}, "teal_shirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7474}, "teal_topwear": {"source": "implied"}, "text": {"source": "probe"}, "topwear": {"source": "implied"}, "vest": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8403}, "white_necktie": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6418}}, "structural": ["solo", "anthro", "male", "clothed"], "probe": ["text", "solo", "felid", "clothing", "anthro"], "t1": 2.8, "t2": 1.62, "t3": 0.98, "t3s": 0.7, "t3p": 1.22, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 
generic_char_to_general=0 unknown_type=0", "Stage3 split: general=18 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
11
+ {"id": 260449, "n_gt": 14, "n_retrieved": 18, "n_selected": 26, "n_implied": 5, "n_structural": 7, "n_probe": 5, "ret_R": 0.5, "P": 0.4615, "R": 0.8571, "F1": 0.6, "leaf_P": 0.3333, "leaf_R": 0.6, "leaf_F1": 0.4286, "n_leaf_sel": 18, "n_leaf_gt": 10, "ret_P": 0.3889, "sel_given_ret": 1.7143, "over_sel": 1.86, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 15, "attempts_by_n_local": {"21": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.4615, "gen_R": 0.8571, "gen_F1": 0.6, "missed": ["fur", "human"], "extra": ["anthro", "bottomwear", "cheeky", "duo", "feral", "grin", "laugh", "loincloth", "raised_arm", "raised_arms", "smile", "topless", "trio", "wide_grin"], "ground_truth_tags": ["ape", "bear", "clothed", "clothing", "dancing", "fur", "group", "hair", "haplorhine", "human", "looking_at_viewer", "male", "mammal", "primate"], "selected_tags": ["anthro", "ape", "bear", "bottomwear", "cheeky", "clothed", "clothing", "dancing", "duo", "feral", "grin", "group", "hair", "haplorhine", "laugh", "loincloth", "looking_at_viewer", "male", "mammal", "primate", "raised_arm", "raised_arms", "smile", "topless", "trio", "wide_grin"], "stage3_selected": ["ape", "bear", "cheeky", "dancing", "grin", "hair", "laugh", "loincloth", "looking_at_viewer", "male", "primate", "raised_arm", "raised_arms", "simple_background", "wide_grin"], "stage3_selected_scores": {"male": 0.5604, "hair": 0.5445, "simple_background": 0.5491, "looking_at_viewer": 0.5475, "bear": 0.5735, "grin": 0.5653, "raised_arm": 0.421, "primate": 
0.8905, "loincloth": 0.5685, "dancing": 0.5568, "laugh": 0.5259, "ape": 0.9767, "raised_arms": 0.5445, "cheeky": 0.3903, "wide_grin": 0.5267}, "stage3_selected_ranks": {"male": 6, "hair": 11, "simple_background": 8, "looking_at_viewer": 9, "bear": 3, "grin": 5, "raised_arm": 18, "primate": 2, "loincloth": 4, "dancing": 7, "laugh": 13, "ape": 1, "raised_arms": 10, "cheeky": 20, "wide_grin": 12}, "stage3_selected_phrase_ranks": {"male": 1, "hair": 1, "simple_background": 1, "looking_at_viewer": 1, "bear": 1, "grin": 1, "raised_arm": 1, "primate": 1, "loincloth": 1, "dancing": 1, "laugh": 1, "ape": 1, "raised_arms": 1, "cheeky": 1, "wide_grin": 1}, "extra_evidence": {"anthro": {"source": "structural"}, "bottomwear": {"source": "implied"}, "cheeky": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3903}, "duo": {"source": "probe"}, "feral": {"source": "structural"}, "grin": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5653}, "laugh": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5259}, "loincloth": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5685}, "raised_arm": {"source": "stage3", "why": "unknown", "retrieval_score": 0.421}, "raised_arms": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5445}, "smile": {"source": "implied"}, "topless": {"source": "structural"}, "trio": {"source": "structural"}, "wide_grin": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5267}}, "structural": ["trio", "anthro", "feral", "male", "clothed", "topless", "looking_at_viewer"], "probe": ["simple_background", "group", "duo", "bear", "anthro"], "t1": 2.25, "t2": 1.84, "t3": 4.03, "t3s": 1.91, "t3p": 2.25, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=21 entity=0 copyright_filtered=0 generic_char_to_general=1 unknown_type=2"]}
data/eval_results/k_sweep_explicit_no_why_seed42_k1.jsonl ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"_meta": true, "timestamp": "2026-03-03T05:54:21.251706", "n_samples": 10, "caption_field": "caption_cogvlm", "skip_rewrite": false, "allow_nsfw": false, "mode": "chunked_map_union", "chunk_size": 60, "eval_path": "data\\eval_samples\\e621_sfw_sample_1000_seed123_buffer10000_caption_evident.jsonl", "per_phrase_k": 2, "per_phrase_final_k": 1, "temperature": 0.0, "shuffle": true, "seed": 42, "workers": 1, "min_why": "strong_implied", "expand_implications": true, "infer_structural": true, "infer_probe": true, "n_errors": 0, "n_issue_samples": 10, "n_issues_total": 20}
2
+ {"id": 17482, "n_gt": 22, "n_retrieved": 20, "n_selected": 36, "n_implied": 18, "n_structural": 4, "n_probe": 5, "ret_R": 0.2727, "P": 0.4444, "R": 0.7273, "F1": 0.5517, "leaf_P": 0.3333, "leaf_R": 0.3846, "leaf_F1": 0.3571, "n_leaf_sel": 15, "n_leaf_gt": 13, "ret_P": 0.3, "sel_given_ret": 2.6667, "over_sel": 1.64, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 11, "attempts_by_n_local": {"22": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.4444, "gen_R": 0.7273, "gen_F1": 0.5517, "missed": ["bass_guitar", "fingers", "fur", "holding_musical_instrument", "holding_object", "music"], "extra": ["action_pose", "atmosphere", "bear", "bottomwear", "canis", "denim", "denim_clothing", "domestic_dog", "flowing_hair", "jeans", "looking_at_viewer", "pants", "pastel_background", "playing_guitar", "playing_music", "pose", "torn_bottomwear", "torn_jeans", "torn_pants", "unknown_species"], "ground_truth_tags": ["anthro", "bass_guitar", "canid", "canine", "claws", "clothed", "clothing", "fingers", "fur", "guitar", "hair", "holding_musical_instrument", "holding_object", "mammal", "music", "musical_instrument", "plucked_string_instrument", "solo", "spade_tail", "string_instrument", "tail", "torn_clothing"], "selected_tags": ["action_pose", "anthro", "atmosphere", "bear", "bottomwear", "canid", "canine", "canis", "claws", "clothed", "clothing", "denim", "denim_clothing", "domestic_dog", "flowing_hair", "guitar", "hair", "jeans", "looking_at_viewer", "mammal", "musical_instrument", "pants", 
"pastel_background", "playing_guitar", "playing_music", "plucked_string_instrument", "pose", "solo", "spade_tail", "string_instrument", "tail", "torn_bottomwear", "torn_clothing", "torn_jeans", "torn_pants", "unknown_species"], "stage3_selected": ["action_pose", "atmosphere", "claws", "domestic_dog", "flowing_hair", "guitar", "pastel_background", "playing_guitar", "spade_tail", "torn_jeans", "unknown_species"], "stage3_selected_scores": {"claws": 0.5694, "domestic_dog": 0.5598, "unknown_species": 0.5792, "spade_tail": 0.6166, "guitar": 0.9627, "action_pose": 0.5829, "playing_guitar": 0.9312, "torn_jeans": 0.4829, "flowing_hair": 0.5661, "atmosphere": 0.5022, "pastel_background": 0.5696}, "stage3_selected_ranks": {"claws": 10, "domestic_dog": 14, "unknown_species": 6, "spade_tail": 3, "guitar": 1, "action_pose": 5, "playing_guitar": 2, "torn_jeans": 18, "flowing_hair": 12, "atmosphere": 17, "pastel_background": 9}, "stage3_selected_phrase_ranks": {"claws": 1, "domestic_dog": 1, "unknown_species": 1, "spade_tail": 1, "guitar": 1, "action_pose": 1, "playing_guitar": 1, "torn_jeans": 1, "flowing_hair": 1, "atmosphere": 1, "pastel_background": 1}, "extra_evidence": {"action_pose": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5829}, "atmosphere": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5022}, "bear": {"source": "probe"}, "bottomwear": {"source": "implied"}, "canis": {"source": "implied"}, "denim": {"source": "implied"}, "denim_clothing": {"source": "implied"}, "domestic_dog": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5598}, "flowing_hair": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5661}, "jeans": {"source": "implied"}, "looking_at_viewer": {"source": "structural"}, "pants": {"source": "implied"}, "pastel_background": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5696}, "playing_guitar": {"source": "stage3", "why": "unknown", "retrieval_score": 0.9312}, "playing_music": {"source": 
"implied"}, "pose": {"source": "implied"}, "torn_bottomwear": {"source": "implied"}, "torn_jeans": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4829}, "torn_pants": {"source": "implied"}, "unknown_species": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5792}}, "structural": ["solo", "anthro", "clothed", "looking_at_viewer"], "probe": ["solo", "clothing", "canid", "bear", "anthro"], "t1": 2.61, "t2": 3.15, "t3": 3.19, "t3s": 3.3, "t3p": 3.23, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=22 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
3
+ {"id": 1624724, "n_gt": 4, "n_retrieved": 11, "n_selected": 12, "n_implied": 0, "n_structural": 4, "n_probe": 4, "ret_R": 0.75, "P": 0.25, "R": 0.75, "F1": 0.375, "leaf_P": 0.25, "leaf_R": 0.75, "leaf_F1": 0.375, "n_leaf_sel": 12, "n_leaf_gt": 4, "ret_P": 0.2727, "sel_given_ret": 1.0, "over_sel": 3.0, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 7, "attempts_by_n_local": {"14": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.25, "gen_R": 0.75, "gen_F1": 0.375, "missed": ["smile"], "extra": ["<3", "ambiguous_gender", "anthro", "big_eyes", "cartoon", "feral", "floating", "nude", "spots"], "ground_truth_tags": ["red_nose", "smile", "solo", "tan_body"], "selected_tags": ["<3", "ambiguous_gender", "anthro", "big_eyes", "cartoon", "feral", "floating", "nude", "red_nose", "solo", "spots", "tan_body"], "stage3_selected": ["big_eyes", "cartoon", "floating", "red_nose", "spots", "tan_body", "white_background"], "stage3_selected_scores": {"white_background": 0.6138, "tan_body": 0.6627, "spots": 0.6272, "big_eyes": 0.696, "red_nose": 0.7501, "floating": 0.6519, "cartoon": 0.5003}, "stage3_selected_ranks": {"white_background": 9, "tan_body": 5, "spots": 8, "big_eyes": 4, "red_nose": 3, "floating": 6, "cartoon": 13}, "stage3_selected_phrase_ranks": {"white_background": 1, "tan_body": 1, "spots": 1, "big_eyes": 1, "red_nose": 1, "floating": 1, "cartoon": 1}, "extra_evidence": {"<3": {"source": "probe"}, "ambiguous_gender": {"source": "structural"}, "anthro": {"source": "probe"}, "big_eyes": {"source": 
"stage3", "why": "unknown", "retrieval_score": 0.696}, "cartoon": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5003}, "feral": {"source": "structural"}, "floating": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6519}, "nude": {"source": "structural"}, "spots": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6272}}, "structural": ["solo", "feral", "ambiguous_gender", "nude"], "probe": ["solo", "simple_background", "anthro", "<3"], "t1": 4.56, "t2": 1.12, "t3": 2.0, "t3s": 1.48, "t3p": 2.52, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=14 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=5"]}
4
+ {"id": 1078019, "n_gt": 14, "n_retrieved": 13, "n_selected": 16, "n_implied": 5, "n_structural": 4, "n_probe": 4, "ret_R": 0.2857, "P": 0.6875, "R": 0.7857, "F1": 0.7333, "leaf_P": 0.6364, "leaf_R": 0.7778, "leaf_F1": 0.7, "n_leaf_sel": 11, "n_leaf_gt": 9, "ret_P": 0.3077, "sel_given_ret": 2.75, "over_sel": 1.14, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 5, "attempts_by_n_local": {"14": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.6875, "gen_R": 0.7857, "gen_F1": 0.7333, "missed": ["blue_eyes", "romantic", "romantic_couple"], "extra": ["<3", "coat", "looking_at_viewer", "round_eyes", "topwear"], "ground_truth_tags": ["anthro", "blue_eyes", "blush", "clothed", "clothing", "duo", "lagomorph", "leporid", "mammal", "plushie", "rabbit", "romantic", "romantic_couple", "teal_eyes"], "selected_tags": ["<3", "anthro", "blush", "clothed", "clothing", "coat", "duo", "lagomorph", "leporid", "looking_at_viewer", "mammal", "plushie", "rabbit", "round_eyes", "teal_eyes", "topwear"], "stage3_selected": ["coat", "plushie", "rabbit", "round_eyes", "teal_eyes"], "stage3_selected_scores": {"rabbit": 0.5842, "coat": 0.6315, "plushie": 0.6566, "teal_eyes": 0.6344, "round_eyes": 0.4982}, "stage3_selected_ranks": {"rabbit": 8, "coat": 5, "plushie": 3, "teal_eyes": 4, "round_eyes": 14}, "stage3_selected_phrase_ranks": {"rabbit": 1, "coat": 1, "plushie": 1, "teal_eyes": 1, "round_eyes": 1}, "extra_evidence": {"<3": {"source": "probe"}, "coat": {"source": "stage3", "why": "unknown", "retrieval_score": 
0.6315}, "looking_at_viewer": {"source": "structural"}, "round_eyes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4982}, "topwear": {"source": "implied"}}, "structural": ["duo", "anthro", "clothed", "looking_at_viewer"], "probe": ["duo", "blush", "anthro", "<3"], "t1": 3.2, "t2": 1.43, "t3": 2.44, "t3s": 1.9, "t3p": 3.27, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=14 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
5
+ {"id": 2021552, "n_gt": 25, "n_retrieved": 17, "n_selected": 30, "n_implied": 11, "n_structural": 4, "n_probe": 5, "ret_R": 0.48, "P": 0.7, "R": 0.84, "F1": 0.7636, "leaf_P": 0.6875, "leaf_R": 0.7333, "leaf_F1": 0.7097, "n_leaf_sel": 16, "n_leaf_gt": 15, "ret_P": 0.7059, "sel_given_ret": 1.75, "over_sel": 1.2, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 13, "attempts_by_n_local": {"19": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.7, "gen_R": 0.84, "gen_F1": 0.7636, "missed": ["canid", "canine", "fox", "looking_at_another"], "extra": ["black_bottomwear", "black_clothing", "black_pants", "looking_at_viewer", "open_mouth", "text", "white_clothing", "white_shirt", "white_topwear"], "ground_truth_tags": ["anthro", "bottomwear", "canid", "canine", "claws", "clothed", "clothing", "crossed_arms", "duo", "facial_markings", "fox", "fur", "grey_background", "head_markings", "lagomorph", "leporid", "looking_at_another", "mammal", "markings", "overalls", "pants", "rabbit", "shirt", "standing", "topwear"], "selected_tags": ["anthro", "black_bottomwear", "black_clothing", "black_pants", "bottomwear", "claws", "clothed", "clothing", "crossed_arms", "duo", "facial_markings", "fur", "grey_background", "head_markings", "lagomorph", "leporid", "looking_at_viewer", "mammal", "markings", "open_mouth", "overalls", "pants", "rabbit", "shirt", "standing", "text", "topwear", "white_clothing", "white_shirt", "white_topwear"], "stage3_selected": ["black_pants", "claws", "crossed_arms", "facial_markings", 
"fur", "grey_background", "open_mouth", "overalls", "pants", "rabbit", "shirt", "standing", "white_shirt"], "stage3_selected_scores": {"fur": 0.6532, "open_mouth": 0.6331, "claws": 0.6304, "standing": 0.6879, "shirt": 0.7484, "rabbit": 0.6511, "pants": 0.759, "grey_background": 0.6785, "facial_markings": 0.6946, "crossed_arms": 0.7286, "white_shirt": 0.8198, "overalls": 0.8776, "black_pants": 0.8331}, "stage3_selected_ranks": {"fur": 12, "open_mouth": 15, "claws": 16, "standing": 9, "shirt": 6, "rabbit": 13, "pants": 5, "grey_background": 11, "facial_markings": 8, "crossed_arms": 7, "white_shirt": 4, "overalls": 2, "black_pants": 3}, "stage3_selected_phrase_ranks": {"fur": 1, "open_mouth": 1, "claws": 1, "standing": 1, "shirt": 1, "rabbit": 1, "pants": 1, "grey_background": 1, "facial_markings": 1, "crossed_arms": 1, "white_shirt": 1, "overalls": 1, "black_pants": 1}, "extra_evidence": {"black_bottomwear": {"source": "implied"}, "black_clothing": {"source": "implied"}, "black_pants": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8331}, "looking_at_viewer": {"source": "structural"}, "open_mouth": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6331}, "text": {"source": "probe"}, "white_clothing": {"source": "implied"}, "white_shirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8198}, "white_topwear": {"source": "implied"}}, "structural": ["duo", "anthro", "clothed", "looking_at_viewer"], "probe": ["text", "simple_background", "duo", "clothing", "anthro"], "t1": 1.89, "t2": 1.57, "t3": 2.15, "t3s": 1.02, "t3p": 0.8, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=19 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
6
+ {"id": 1023509, "n_gt": 13, "n_retrieved": 22, "n_selected": 36, "n_implied": 7, "n_structural": 6, "n_probe": 5, "ret_R": 0.2308, "P": 0.25, "R": 0.6923, "F1": 0.3673, "leaf_P": 0.1429, "leaf_R": 0.6667, "leaf_F1": 0.2353, "n_leaf_sel": 28, "n_leaf_gt": 6, "ret_P": 0.1364, "sel_given_ret": 3.0, "over_sel": 2.77, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 20, "attempts_by_n_local": {"22": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 1, "char_F1": 0.0, "gen_P": 0.2571, "gen_R": 0.6923, "gen_F1": 0.375, "missed": ["dialogue", "fur", "white_body", "white_fur"], "extra": ["<3", "anthro", "bear", "bubble", "cjk_character", "clothed", "clothing", "darkness", "empty_speech_bubble", "epaulet", "felid", "flask", "group", "intersex", "light", "lying_on_ground", "model_sheet", "solo", "speech_bubble", "standing", "standing_over", "taur", "topwear", "unknown_species", "unnamed_character", "vest", "waist"], "ground_truth_tags": ["bovid", "caprine", "dialogue", "fur", "goat", "human", "lizard", "mammal", "reptile", "scalie", "text", "white_body", "white_fur"], "selected_tags": ["<3", "anthro", "bear", "bovid", "bubble", "caprine", "cjk_character", "clothed", "clothing", "darkness", "empty_speech_bubble", "epaulet", "felid", "flask", "goat", "group", "human", "intersex", "light", "lizard", "lying_on_ground", "mammal", "model_sheet", "reptile", "scalie", "solo", "speech_bubble", "standing", "standing_over", "taur", "text", "topwear", "unknown_species", "unnamed_character", "vest", "waist"], "stage3_selected": ["bubble", 
"cjk_character", "darkness", "empty_speech_bubble", "epaulet", "flask", "goat", "group", "human", "light", "lizard", "lying_on_ground", "model_sheet", "speech_bubble", "standing", "standing_over", "unknown_species", "unnamed_character", "vest", "waist"], "stage3_selected_scores": {"group": 0.4649, "standing": 0.579, "human": 0.5558, "speech_bubble": 0.567, "lizard": 0.5896, "goat": 0.5748, "light": 0.5716, "model_sheet": 0.4033, "unknown_species": 0.5842, "vest": 0.39, "bubble": 0.5665, "lying_on_ground": 0.583, "darkness": 0.5867, "flask": 0.3707, "standing_over": 0.4257, "unnamed_character": 0.4123, "waist": 0.7395, "empty_speech_bubble": 0.39, "epaulet": 0.3917, "cjk_character": 0.4178}, "stage3_selected_ranks": {"group": 12, "standing": 6, "human": 11, "speech_bubble": 9, "lizard": 2, "goat": 7, "light": 8, "model_sheet": 16, "unknown_species": 4, "vest": 18, "bubble": 10, "lying_on_ground": 5, "darkness": 3, "flask": 20, "standing_over": 13, "unnamed_character": 15, "waist": 1, "empty_speech_bubble": 19, "epaulet": 17, "cjk_character": 14}, "stage3_selected_phrase_ranks": {"group": 1, "standing": 1, "human": 1, "speech_bubble": 1, "lizard": 1, "goat": 1, "light": 1, "model_sheet": 1, "unknown_species": 1, "vest": 1, "bubble": 1, "lying_on_ground": 1, "darkness": 1, "flask": 1, "standing_over": 1, "unnamed_character": 1, "waist": 1, "empty_speech_bubble": 1, "epaulet": 1, "cjk_character": 1}, "extra_evidence": {"<3": {"source": "probe"}, "anthro": {"source": "structural"}, "bear": {"source": "probe"}, "bubble": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5665}, "cjk_character": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4178}, "clothed": {"source": "structural"}, "clothing": {"source": "implied"}, "darkness": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5867}, "empty_speech_bubble": {"source": "stage3", "why": "unknown", "retrieval_score": 0.39}, "epaulet": {"source": "stage3", "why": "unknown", "retrieval_score": 
0.3917}, "felid": {"source": "probe"}, "flask": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3707}, "group": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4649}, "intersex": {"source": "structural"}, "light": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5716}, "lying_on_ground": {"source": "stage3", "why": "unknown", "retrieval_score": 0.583}, "model_sheet": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4033}, "solo": {"source": "structural"}, "speech_bubble": {"source": "stage3", "why": "unknown", "retrieval_score": 0.567}, "standing": {"source": "stage3", "why": "unknown", "retrieval_score": 0.579}, "standing_over": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4257}, "taur": {"source": "structural"}, "topwear": {"source": "implied"}, "unknown_species": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5842}, "unnamed_character": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4123}, "vest": {"source": "stage3", "why": "unknown", "retrieval_score": 0.39}, "waist": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7395}}, "structural": ["solo", "anthro", "taur", "intersex", "clothed", "text"], "probe": ["group", "felid", "bear", "anthro", "<3"], "t1": 1.37, "t2": 2.0, "t3": 2.92, "t3s": 1.37, "t3p": 1.37, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=22 entity=0 copyright_filtered=0 generic_char_to_general=1 unknown_type=1"]}
7
+ {"id": 335343, "n_gt": 15, "n_retrieved": 22, "n_selected": 27, "n_implied": 3, "n_structural": 3, "n_probe": 1, "ret_R": 0.5333, "P": 0.3333, "R": 0.6, "F1": 0.4286, "leaf_P": 0.3182, "leaf_R": 0.5833, "leaf_F1": 0.4118, "n_leaf_sel": 22, "n_leaf_gt": 12, "ret_P": 0.3636, "sel_given_ret": 1.125, "over_sel": 1.8, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 22, "attempts_by_n_local": {"23": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3333, "gen_R": 0.6, "gen_F1": 0.4286, "missed": ["angry", "bed", "eyes_closed", "eyeshadow", "furniture", "lying"], "extra": ["bedroom", "distracting_watermark", "eyes", "felid", "font", "hand_on_own_chest", "humanoid", "mammal", "membrane_(anatomy)", "palette", "playful", "purple_membrane", "resting", "romantic", "romantic_ambiance", "sleepover", "stats", "watermark"], "ground_truth_tags": ["angry", "bed", "blonde_hair", "blue_eyes", "duo", "eyes_closed", "eyeshadow", "furniture", "green_eyes", "hair", "lying", "makeup", "purple_hair", "sleeping", "text"], "selected_tags": ["bedroom", "blonde_hair", "blue_eyes", "distracting_watermark", "duo", "eyes", "felid", "font", "green_eyes", "hair", "hand_on_own_chest", "humanoid", "makeup", "mammal", "membrane_(anatomy)", "palette", "playful", "purple_hair", "purple_membrane", "resting", "romantic", "romantic_ambiance", "sleeping", "sleepover", "stats", "text", "watermark"], "stage3_selected": ["bedroom", "blonde_hair", "blue_eyes", "distracting_watermark", "eyes", "font", "green_eyes", "hair", 
"hand_on_own_chest", "invalid_tag", "makeup", "palette", "playful", "purple_hair", "purple_membrane", "resting", "romantic_ambiance", "sleeping", "sleepover", "stats", "text", "watermark"], "stage3_selected_scores": {"hair": 0.6037, "text": 0.6013, "blue_eyes": 0.6019, "green_eyes": 0.5995, "blonde_hair": 0.5991, "purple_hair": 0.5644, "makeup": 0.5969, "watermark": 0.6047, "bedroom": 0.4906, "sleeping": 0.6033, "romantic_ambiance": 0.4808, "distracting_watermark": 0.4788, "playful": 0.447, "resting": 0.5149, "invalid_tag": 0.5594, "stats": 0.5066, "palette": 0.6688, "sleepover": 0.3804, "purple_membrane": 0.579, "hand_on_own_chest": 0.5253, "font": 0.5303, "eyes": 0.895}, "stage3_selected_ranks": {"hair": 5, "text": 8, "blue_eyes": 7, "green_eyes": 9, "blonde_hair": 10, "purple_hair": 13, "makeup": 11, "watermark": 4, "bedroom": 19, "sleeping": 6, "romantic_ambiance": 20, "distracting_watermark": 21, "playful": 22, "resting": 17, "invalid_tag": 14, "stats": 18, "palette": 3, "sleepover": 23, "purple_membrane": 12, "hand_on_own_chest": 16, "font": 15, "eyes": 1}, "stage3_selected_phrase_ranks": {"hair": 1, "text": 1, "blue_eyes": 1, "green_eyes": 1, "blonde_hair": 1, "purple_hair": 1, "makeup": 1, "watermark": 1, "bedroom": 1, "sleeping": 1, "romantic_ambiance": 1, "distracting_watermark": 1, "playful": 1, "resting": 1, "invalid_tag": 1, "stats": 1, "palette": 1, "sleepover": 1, "purple_membrane": 1, "hand_on_own_chest": 1, "font": 1, "eyes": 1}, "extra_evidence": {"bedroom": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4906}, "distracting_watermark": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4788}, "eyes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.895}, "felid": {"source": "probe"}, "font": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5303}, "hand_on_own_chest": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5253}, "humanoid": {"source": "structural"}, "mammal": {"source": "implied"}, 
"membrane_(anatomy)": {"source": "implied"}, "palette": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6688}, "playful": {"source": "stage3", "why": "unknown", "retrieval_score": 0.447}, "purple_membrane": {"source": "stage3", "why": "unknown", "retrieval_score": 0.579}, "resting": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5149}, "romantic": {"source": "implied"}, "romantic_ambiance": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4808}, "sleepover": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3804}, "stats": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5066}, "watermark": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6047}}, "structural": ["duo", "humanoid", "text"], "probe": ["felid"], "t1": 1.83, "t2": 2.0, "t3": 1.38, "t3s": 3.32, "t3p": 1.84, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=23 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
8
+ {"id": 2034167, "n_gt": 11, "n_retrieved": 21, "n_selected": 19, "n_implied": 7, "n_structural": 4, "n_probe": 5, "ret_R": 0.5455, "P": 0.3158, "R": 0.5455, "F1": 0.4, "leaf_P": 0.2, "leaf_R": 0.2857, "leaf_F1": 0.2353, "n_leaf_sel": 10, "n_leaf_gt": 7, "ret_P": 0.2857, "sel_given_ret": 1.0, "over_sel": 1.73, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 7, "attempts_by_n_local": {"24": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3158, "gen_R": 0.5455, "gen_F1": 0.4, "missed": ["blue_eyes", "blue_nose", "open_mouth", "white_body", "white_fur"], "extra": ["animal_humanoid", "anthro", "canid_humanoid", "canine_humanoid", "clothed", "clothing", "curved_tail", "humanoid", "male", "mammal_humanoid", "pink_stripes", "stripes", "tail"], "ground_truth_tags": ["blue_eyes", "blue_nose", "canid", "canine", "fur", "mammal", "open_mouth", "purple_body", "solo", "white_body", "white_fur"], "selected_tags": ["animal_humanoid", "anthro", "canid", "canid_humanoid", "canine", "canine_humanoid", "clothed", "clothing", "curved_tail", "fur", "humanoid", "male", "mammal", "mammal_humanoid", "pink_stripes", "purple_body", "solo", "stripes", "tail"], "stage3_selected": ["canine_humanoid", "curved_tail", "fur", "pink_stripes", "purple_body", "simple_background", "tail"], "stage3_selected_scores": {"fur": 0.5666, "simple_background": 0.5782, "tail": 0.5897, "purple_body": 0.5476, "canine_humanoid": 0.9128, "pink_stripes": 0.5444, "curved_tail": 0.5958}, "stage3_selected_ranks": {"fur": 16, 
"simple_background": 13, "tail": 9, "purple_body": 18, "canine_humanoid": 1, "pink_stripes": 19, "curved_tail": 6}, "stage3_selected_phrase_ranks": {"fur": 1, "simple_background": 1, "tail": 1, "purple_body": 1, "canine_humanoid": 1, "pink_stripes": 1, "curved_tail": 1}, "extra_evidence": {"animal_humanoid": {"source": "implied"}, "anthro": {"source": "structural"}, "canid_humanoid": {"source": "implied"}, "canine_humanoid": {"source": "stage3", "why": "unknown", "retrieval_score": 0.9128}, "clothed": {"source": "structural"}, "clothing": {"source": "probe"}, "curved_tail": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5958}, "humanoid": {"source": "implied"}, "male": {"source": "structural"}, "mammal_humanoid": {"source": "implied"}, "pink_stripes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5444}, "stripes": {"source": "implied"}, "tail": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5897}}, "structural": ["solo", "anthro", "male", "clothed"], "probe": ["solo", "simple_background", "clothing", "canid", "anthro"], "t1": 2.01, "t2": 1.78, "t3": 1.92, "t3s": 1.67, "t3p": 2.98, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=24 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=4"]}
9
+ {"id": 1325009, "n_gt": 22, "n_retrieved": 18, "n_selected": 20, "n_implied": 4, "n_structural": 4, "n_probe": 5, "ret_R": 0.2273, "P": 0.65, "R": 0.5909, "F1": 0.619, "leaf_P": 0.2857, "leaf_R": 0.3333, "leaf_F1": 0.3077, "n_leaf_sel": 14, "n_leaf_gt": 12, "ret_P": 0.2778, "sel_given_ret": 2.6, "over_sel": 0.91, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 9, "attempts_by_n_local": {"18": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.65, "gen_R": 0.5909, "gen_F1": 0.619, "missed": ["chest_tuft", "countershading", "muscular", "muscular_anthro", "muscular_male", "pantherine", "tiger", "topless", "tuft"], "extra": ["bear", "countershade_body", "fluffy_fur", "pose", "striped_body", "striped_fur", "white_chest"], "ground_truth_tags": ["anthro", "blue_eyes", "bottomwear", "chest_tuft", "clothed", "clothing", "countershading", "felid", "fur", "hand_on_head", "male", "mammal", "muscular", "muscular_anthro", "muscular_male", "pantherine", "shorts", "solo", "stripes", "tiger", "topless", "tuft"], "selected_tags": ["anthro", "bear", "blue_eyes", "bottomwear", "clothed", "clothing", "countershade_body", "felid", "fluffy_fur", "fur", "hand_on_head", "male", "mammal", "pose", "shorts", "solo", "striped_body", "striped_fur", "stripes", "white_chest"], "stage3_selected": ["blue_eyes", "countershade_body", "fluffy_fur", "fur", "hand_on_head", "pose", "shorts", "striped_fur", "white_chest"], "stage3_selected_scores": {"fur": 0.5941, "blue_eyes": 0.5774, "pose": 0.6303, "shorts": 0.5899, "striped_fur": 
0.6464, "hand_on_head": 0.6005, "white_chest": 0.9168, "countershade_body": 0.872, "fluffy_fur": 0.6674}, "stage3_selected_ranks": {"fur": 12, "blue_eyes": 14, "pose": 9, "shorts": 13, "striped_fur": 8, "hand_on_head": 11, "white_chest": 2, "countershade_body": 3, "fluffy_fur": 6}, "stage3_selected_phrase_ranks": {"fur": 1, "blue_eyes": 1, "pose": 1, "shorts": 1, "striped_fur": 1, "hand_on_head": 1, "white_chest": 1, "countershade_body": 1, "fluffy_fur": 1}, "extra_evidence": {"bear": {"source": "probe"}, "countershade_body": {"source": "stage3", "why": "unknown", "retrieval_score": 0.872}, "fluffy_fur": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6674}, "pose": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6303}, "striped_body": {"source": "implied"}, "striped_fur": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6464}, "white_chest": {"source": "stage3", "why": "unknown", "retrieval_score": 0.9168}}, "structural": ["solo", "anthro", "male", "clothed"], "probe": ["solo", "felid", "clothing", "bear", "anthro"], "t1": 1.95, "t2": 1.54, "t3": 0.82, "t3s": 1.63, "t3p": 1.99, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=18 entity=0 copyright_filtered=1 generic_char_to_general=0 unknown_type=2"]}
10
+ {"id": 3285630, "n_gt": 12, "n_retrieved": 16, "n_selected": 29, "n_implied": 8, "n_structural": 4, "n_probe": 6, "ret_R": 0.1667, "P": 0.3448, "R": 0.8333, "F1": 0.4878, "leaf_P": 0.3333, "leaf_R": 0.5556, "leaf_F1": 0.4167, "n_leaf_sel": 15, "n_leaf_gt": 9, "ret_P": 0.125, "sel_given_ret": 5.0, "over_sel": 2.42, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 15, "attempts_by_n_local": {"19": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3448, "gen_R": 0.8333, "gen_F1": 0.4878, "missed": ["alpha_channel", "fingers"], "extra": ["black_body", "black_fur", "brown_clothing", "brown_topwear", "brown_vest", "business_attire", "formal", "hair_bun", "holding_mug", "holding_object", "mug", "necktie", "shirt", "teal_shirt", "teal_topwear", "text", "topwear", "vest", "white_necktie"], "ground_truth_tags": ["alpha_channel", "anthro", "clothed", "clothing", "felid", "feline", "fingers", "fur", "hair", "male", "mammal", "solo"], "selected_tags": ["anthro", "black_body", "black_fur", "brown_clothing", "brown_topwear", "brown_vest", "business_attire", "clothed", "clothing", "felid", "feline", "formal", "fur", "hair", "hair_bun", "holding_mug", "holding_object", "male", "mammal", "mug", "necktie", "shirt", "solo", "teal_shirt", "teal_topwear", "text", "topwear", "vest", "white_necktie"], "stage3_selected": ["black_fur", "brown_vest", "business_attire", "feline", "formal", "fur", "hair_bun", "holding_mug", "mug", "necktie", "shirt", "simple_background", "teal_shirt", "vest", "white_necktie"], 
"stage3_selected_scores": {"fur": 0.7147, "simple_background": 0.6978, "feline": 0.7062, "shirt": 0.7998, "black_fur": 0.7183, "necktie": 0.7314, "vest": 0.8404, "hair_bun": 0.6926, "mug": 0.8841, "holding_mug": 0.916, "formal": 0.5993, "business_attire": 0.5558, "brown_vest": 0.8153, "teal_shirt": 0.7475, "white_necktie": 0.6418}, "stage3_selected_ranks": {"fur": 9, "simple_background": 11, "feline": 10, "shirt": 5, "black_fur": 8, "necktie": 7, "vest": 3, "hair_bun": 12, "mug": 2, "holding_mug": 1, "formal": 16, "business_attire": 18, "brown_vest": 4, "teal_shirt": 6, "white_necktie": 14}, "stage3_selected_phrase_ranks": {"fur": 1, "simple_background": 1, "feline": 1, "shirt": 1, "black_fur": 1, "necktie": 1, "vest": 1, "hair_bun": 1, "mug": 1, "holding_mug": 1, "formal": 1, "business_attire": 1, "brown_vest": 1, "teal_shirt": 1, "white_necktie": 1}, "extra_evidence": {"black_body": {"source": "implied"}, "black_fur": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7183}, "brown_clothing": {"source": "implied"}, "brown_topwear": {"source": "implied"}, "brown_vest": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8153}, "business_attire": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5558}, "formal": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5993}, "hair_bun": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6926}, "holding_mug": {"source": "stage3", "why": "unknown", "retrieval_score": 0.916}, "holding_object": {"source": "implied"}, "mug": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8841}, "necktie": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7314}, "shirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7998}, "teal_shirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7475}, "teal_topwear": {"source": "implied"}, "text": {"source": "probe"}, "topwear": {"source": "implied"}, "vest": {"source": "stage3", "why": "unknown", 
"retrieval_score": 0.8404}, "white_necktie": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6418}}, "structural": ["solo", "anthro", "male", "clothed"], "probe": ["text", "solo", "simple_background", "felid", "clothing", "anthro"], "t1": 1.96, "t2": 1.49, "t3": 2.52, "t3s": 1.91, "t3p": 1.51, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=19 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
11
+ {"id": 260449, "n_gt": 14, "n_retrieved": 21, "n_selected": 21, "n_implied": 5, "n_structural": 6, "n_probe": 5, "ret_R": 0.5, "P": 0.5238, "R": 0.7857, "F1": 0.6286, "leaf_P": 0.3571, "leaf_R": 0.5, "leaf_F1": 0.4167, "n_leaf_sel": 14, "n_leaf_gt": 10, "ret_P": 0.3333, "sel_given_ret": 1.5714, "over_sel": 1.5, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 9, "attempts_by_n_local": {"24": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.5238, "gen_R": 0.7857, "gen_F1": 0.6286, "missed": ["fur", "human", "male"], "extra": ["anthro", "duo", "grin", "humanoid", "mischievous", "raised_arms", "smile", "topless", "trio", "wide_grin"], "ground_truth_tags": ["ape", "bear", "clothed", "clothing", "dancing", "fur", "group", "hair", "haplorhine", "human", "looking_at_viewer", "male", "mammal", "primate"], "selected_tags": ["anthro", "ape", "bear", "clothed", "clothing", "dancing", "duo", "grin", "group", "hair", "haplorhine", "humanoid", "looking_at_viewer", "mammal", "mischievous", "primate", "raised_arms", "smile", "topless", "trio", "wide_grin"], "stage3_selected": ["ape", "bear", "dancing", "grin", "hair", "mischievous", "raised_arms", "simple_background", "wide_grin"], "stage3_selected_scores": {"hair": 0.5495, "simple_background": 0.5541, "bear": 0.5758, "grin": 0.5711, "dancing": 0.5627, "ape": 0.9769, "raised_arms": 0.5526, "mischievous": 0.5449, "wide_grin": 0.5315}, "stage3_selected_ranks": {"hair": 11, "simple_background": 8, "bear": 3, "grin": 5, "dancing": 7, "ape": 1, "raised_arms": 
10, "mischievous": 12, "wide_grin": 14}, "stage3_selected_phrase_ranks": {"hair": 1, "simple_background": 1, "bear": 1, "grin": 1, "dancing": 1, "ape": 1, "raised_arms": 1, "mischievous": 1, "wide_grin": 1}, "extra_evidence": {"anthro": {"source": "structural"}, "duo": {"source": "probe"}, "grin": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5711}, "humanoid": {"source": "structural"}, "mischievous": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5449}, "raised_arms": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5526}, "smile": {"source": "implied"}, "topless": {"source": "structural"}, "trio": {"source": "structural"}, "wide_grin": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5315}}, "structural": ["trio", "anthro", "humanoid", "clothed", "topless", "looking_at_viewer"], "probe": ["simple_background", "group", "duo", "bear", "anthro"], "t1": 2.29, "t2": 1.97, "t3": 2.21, "t3s": 1.21, "t3p": 0.61, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=24 entity=0 copyright_filtered=0 generic_char_to_general=1 unknown_type=2"]}
data/eval_results/k_sweep_explicit_no_why_seed42_k10.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
data/eval_results/k_sweep_explicit_no_why_seed42_k2.jsonl ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"_meta": true, "timestamp": "2026-03-03T05:55:46.995089", "n_samples": 10, "caption_field": "caption_cogvlm", "skip_rewrite": false, "allow_nsfw": false, "mode": "chunked_map_union", "chunk_size": 60, "eval_path": "data\\eval_samples\\e621_sfw_sample_1000_seed123_buffer10000_caption_evident.jsonl", "per_phrase_k": 2, "per_phrase_final_k": 2, "temperature": 0.0, "shuffle": true, "seed": 42, "workers": 1, "min_why": "strong_implied", "expand_implications": true, "infer_structural": true, "infer_probe": true, "n_errors": 0, "n_issue_samples": 10, "n_issues_total": 20}
2
+ {"id": 17482, "n_gt": 22, "n_retrieved": 31, "n_selected": 31, "n_implied": 11, "n_structural": 3, "n_probe": 3, "ret_R": 0.1818, "P": 0.3871, "R": 0.5455, "F1": 0.4528, "leaf_P": 0.2222, "leaf_R": 0.3077, "leaf_F1": 0.2581, "n_leaf_sel": 18, "n_leaf_gt": 13, "ret_P": 0.129, "sel_given_ret": 3.0, "over_sel": 1.41, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 17, "attempts_by_n_local": {"34": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3871, "gen_R": 0.5455, "gen_F1": 0.4528, "missed": ["bass_guitar", "fingers", "fur", "guitar", "holding_musical_instrument", "holding_object", "music", "musical_instrument", "plucked_string_instrument", "string_instrument"], "extra": ["5_claws", "atmosphere", "bass_(disambiguation)", "bonfire", "canis", "clawed_fingers", "flowing_hair", "leggings", "legwear", "long_hair", "pastel_background", "playing", "playing_bass", "stockings", "string", "torn_leggings", "torn_legwear", "torn_stockings", "wolf"], "ground_truth_tags": ["anthro", "bass_guitar", "canid", "canine", "claws", "clothed", "clothing", "fingers", "fur", "guitar", "hair", "holding_musical_instrument", "holding_object", "mammal", "music", "musical_instrument", "plucked_string_instrument", "solo", "spade_tail", "string_instrument", "tail", "torn_clothing"], "selected_tags": ["5_claws", "anthro", "atmosphere", "bass_(disambiguation)", "bonfire", "canid", "canine", "canis", "clawed_fingers", "claws", "clothed", "clothing", "flowing_hair", "hair", "leggings", "legwear", "long_hair", "mammal", 
"pastel_background", "playing", "playing_bass", "solo", "spade_tail", "stockings", "string", "tail", "torn_clothing", "torn_leggings", "torn_legwear", "torn_stockings", "wolf"], "stage3_selected": ["5_claws", "atmosphere", "bass_(disambiguation)", "bonfire", "clawed_fingers", "claws", "flowing_hair", "invalid_background", "long_hair", "pastel_background", "playing", "playing_bass", "spade_tail", "string", "torn_leggings", "torn_stockings", "wolf"], "stage3_selected_scores": {"claws": 0.6305, "wolf": 0.5983, "long_hair": 0.5166, "spade_tail": 0.872, "clawed_fingers": 0.5176, "playing": 0.4743, "string": 0.6132, "torn_stockings": 0.4607, "flowing_hair": 0.7019, "torn_leggings": 0.4903, "bonfire": 0.4621, "5_claws": 0.5907, "atmosphere": 0.503, "bass_(disambiguation)": 0.5206, "playing_bass": 0.5052, "pastel_background": 0.6263, "invalid_background": 0.6032}, "stage3_selected_ranks": {"claws": 5, "wolf": 10, "long_hair": 20, "spade_tail": 1, "clawed_fingers": 19, "playing": 27, "string": 8, "torn_stockings": 30, "flowing_hair": 2, "torn_leggings": 24, "bonfire": 29, "5_claws": 11, "atmosphere": 23, "bass_(disambiguation)": 18, "playing_bass": 21, "pastel_background": 6, "invalid_background": 9}, "stage3_selected_phrase_ranks": {"claws": 1, "wolf": 1, "long_hair": 2, "spade_tail": 1, "clawed_fingers": 2, "playing": 2, "string": 1, "torn_stockings": 2, "flowing_hair": 1, "torn_leggings": 1, "bonfire": 1, "5_claws": 2, "atmosphere": 1, "bass_(disambiguation)": 1, "playing_bass": 1, "pastel_background": 1, "invalid_background": 1}, "extra_evidence": {"5_claws": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5907}, "atmosphere": {"source": "stage3", "why": "unknown", "retrieval_score": 0.503}, "bass_(disambiguation)": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5206}, "bonfire": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4621}, "canis": {"source": "implied"}, "clawed_fingers": {"source": "stage3", "why": "unknown", 
"retrieval_score": 0.5176}, "flowing_hair": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7019}, "leggings": {"source": "implied"}, "legwear": {"source": "implied"}, "long_hair": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5166}, "pastel_background": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6263}, "playing": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4743}, "playing_bass": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5052}, "stockings": {"source": "implied"}, "string": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6132}, "torn_leggings": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4903}, "torn_legwear": {"source": "implied"}, "torn_stockings": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4607}, "wolf": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5983}}, "structural": ["solo", "anthro", "clothed"], "probe": ["solo", "canid", "anthro"], "t1": 2.58, "t2": 2.58, "t3": 5.43, "t3s": 2.84, "t3p": 3.0, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=34 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=3"]}
3
+ {"id": 1624724, "n_gt": 4, "n_retrieved": 23, "n_selected": 12, "n_implied": 1, "n_structural": 5, "n_probe": 3, "ret_R": 0.75, "P": 0.25, "R": 0.75, "F1": 0.375, "leaf_P": 0.2727, "leaf_R": 0.75, "leaf_F1": 0.4, "n_leaf_sel": 11, "n_leaf_gt": 4, "ret_P": 0.1304, "sel_given_ret": 1.0, "over_sel": 3.0, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 6, "attempts_by_n_local": {"26": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.25, "gen_R": 0.75, "gen_F1": 0.375, "missed": ["smile"], "extra": ["ambiguous_gender", "bear", "big_eyes", "feral", "looking_at_viewer", "mammal", "nude", "spots", "toony"], "ground_truth_tags": ["red_nose", "smile", "solo", "tan_body"], "selected_tags": ["ambiguous_gender", "bear", "big_eyes", "feral", "looking_at_viewer", "mammal", "nude", "red_nose", "solo", "spots", "tan_body", "toony"], "stage3_selected": ["big_eyes", "red_nose", "spots", "tan_body", "toony", "white_background"], "stage3_selected_scores": {"white_background": 0.6243, "tan_body": 0.6695, "spots": 0.6322, "toony": 0.6076, "big_eyes": 0.7003, "red_nose": 0.7533}, "stage3_selected_ranks": {"white_background": 13, "tan_body": 7, "spots": 12, "toony": 16, "big_eyes": 4, "red_nose": 3}, "stage3_selected_phrase_ranks": {"white_background": 1, "tan_body": 2, "spots": 2, "toony": 1, "big_eyes": 1, "red_nose": 1}, "extra_evidence": {"ambiguous_gender": {"source": "structural"}, "bear": {"source": "probe"}, "big_eyes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7003}, "feral": {"source": 
"structural"}, "looking_at_viewer": {"source": "structural"}, "mammal": {"source": "implied"}, "nude": {"source": "structural"}, "spots": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6322}, "toony": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6076}}, "structural": ["solo", "feral", "ambiguous_gender", "nude", "looking_at_viewer"], "probe": ["solo", "simple_background", "bear"], "t1": 3.09, "t2": 1.05, "t3": 1.61, "t3s": 1.49, "t3p": 1.35, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=26 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=5"]}
4
+ {"id": 1078019, "n_gt": 14, "n_retrieved": 31, "n_selected": 28, "n_implied": 6, "n_structural": 4, "n_probe": 5, "ret_R": 0.5, "P": 0.5, "R": 1.0, "F1": 0.6667, "leaf_P": 0.4211, "leaf_R": 0.8889, "leaf_F1": 0.5714, "n_leaf_sel": 19, "n_leaf_gt": 9, "ret_P": 0.2258, "sel_given_ret": 2.0, "over_sel": 2.0, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 15, "attempts_by_n_local": {"32": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.5, "gen_R": 1.0, "gen_F1": 0.6667, "missed": [], "extra": ["<3", "coat", "expressions", "eyes", "group", "holding_object", "holding_plushie", "looking_at_viewer", "raincoat", "relationship", "rosy_cheeks", "setting", "surprised_look", "topwear"], "ground_truth_tags": ["anthro", "blue_eyes", "blush", "clothed", "clothing", "duo", "lagomorph", "leporid", "mammal", "plushie", "rabbit", "romantic", "romantic_couple", "teal_eyes"], "selected_tags": ["<3", "anthro", "blue_eyes", "blush", "clothed", "clothing", "coat", "duo", "expressions", "eyes", "group", "holding_object", "holding_plushie", "lagomorph", "leporid", "looking_at_viewer", "mammal", "plushie", "rabbit", "raincoat", "relationship", "romantic", "romantic_couple", "rosy_cheeks", "setting", "surprised_look", "teal_eyes", "topwear"], "stage3_selected": ["blue_eyes", "coat", "expressions", "eyes", "group", "holding_plushie", "plushie", "rabbit", "raincoat", "relationship", "romantic_couple", "rosy_cheeks", "setting", "surprised_look", "teal_eyes"], "stage3_selected_scores": {"blue_eyes": 0.6151, "group": 
0.3374, "rabbit": 0.5939, "romantic_couple": 0.5621, "coat": 0.6383, "plushie": 0.7455, "teal_eyes": 0.6283, "rosy_cheeks": 0.472, "expressions": 0.5454, "holding_plushie": 0.7793, "raincoat": 0.5262, "surprised_look": 0.6399, "relationship": 0.6206, "setting": 0.5567, "eyes": 0.8767}, "stage3_selected_ranks": {"blue_eyes": 10, "group": 32, "rabbit": 11, "romantic_couple": 13, "coat": 6, "plushie": 3, "teal_eyes": 7, "rosy_cheeks": 28, "expressions": 17, "holding_plushie": 2, "raincoat": 20, "surprised_look": 4, "relationship": 8, "setting": 15, "eyes": 1}, "stage3_selected_phrase_ranks": {"blue_eyes": 1, "group": 2, "rabbit": 1, "romantic_couple": 1, "coat": 1, "plushie": 1, "teal_eyes": 1, "rosy_cheeks": 2, "expressions": 2, "holding_plushie": 1, "raincoat": 2, "surprised_look": 1, "relationship": 1, "setting": 1, "eyes": 1}, "extra_evidence": {"<3": {"source": "probe"}, "coat": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6383}, "expressions": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5454}, "eyes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8767}, "group": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3374}, "holding_object": {"source": "implied"}, "holding_plushie": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7793}, "looking_at_viewer": {"source": "structural"}, "raincoat": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5262}, "relationship": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6206}, "rosy_cheeks": {"source": "stage3", "why": "unknown", "retrieval_score": 0.472}, "setting": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5567}, "surprised_look": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6399}, "topwear": {"source": "implied"}}, "structural": ["duo", "anthro", "clothed", "looking_at_viewer"], "probe": ["duo", "clothing", "blush", "anthro", "<3"], "t1": 1.79, "t2": 1.4, "t3": 2.82, "t3s": 0.98, "t3p": 1.24, "err": 
null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=32 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
5
+ {"id": 2021552, "n_gt": 25, "n_retrieved": 29, "n_selected": 43, "n_implied": 17, "n_structural": 4, "n_probe": 5, "ret_R": 0.48, "P": 0.4651, "R": 0.8, "F1": 0.5882, "leaf_P": 0.4286, "leaf_R": 0.6, "leaf_F1": 0.5, "n_leaf_sel": 21, "n_leaf_gt": 15, "ret_P": 0.4138, "sel_given_ret": 1.6667, "over_sel": 1.72, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 21, "attempts_by_n_local": {"30": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.4651, "gen_R": 0.8, "gen_F1": 0.5882, "missed": ["lagomorph", "leporid", "looking_at_another", "rabbit", "standing"], "extra": ["black_bottomwear", "black_clothing", "black_pants", "blue_overalls", "brown_clothing", "brown_shirt", "brown_topwear", "dyed_fur", "eye_markings", "grey_bottomwear", "grey_clothing", "grey_pants", "looking_at_viewer", "marble_fox", "open_mouth", "open_smile", "red_fox", "smile", "t-shirt", "text", "white_clothing", "white_shirt", "white_topwear"], "ground_truth_tags": ["anthro", "bottomwear", "canid", "canine", "claws", "clothed", "clothing", "crossed_arms", "duo", "facial_markings", "fox", "fur", "grey_background", "head_markings", "lagomorph", "leporid", "looking_at_another", "mammal", "markings", "overalls", "pants", "rabbit", "shirt", "standing", "topwear"], "selected_tags": ["anthro", "black_bottomwear", "black_clothing", "black_pants", "blue_overalls", "bottomwear", "brown_clothing", "brown_shirt", "brown_topwear", "canid", "canine", "claws", "clothed", "clothing", "crossed_arms", "duo", "dyed_fur", "eye_markings", 
"facial_markings", "fox", "fur", "grey_background", "grey_bottomwear", "grey_clothing", "grey_pants", "head_markings", "looking_at_viewer", "mammal", "marble_fox", "markings", "open_mouth", "open_smile", "overalls", "pants", "red_fox", "shirt", "smile", "t-shirt", "text", "topwear", "white_clothing", "white_shirt", "white_topwear"], "stage3_selected": ["black_pants", "blue_overalls", "brown_shirt", "claws", "crossed_arms", "dyed_fur", "eye_markings", "facial_markings", "fox", "fur", "grey_background", "grey_pants", "head_markings", "invalid_tag", "marble_fox", "open_mouth", "open_smile", "overalls", "shirt", "t-shirt", "white_shirt"], "stage3_selected_scores": {"fur": 0.6531, "open_mouth": 0.633, "claws": 0.6304, "fox": 0.638, "shirt": 0.7483, "open_smile": 0.5273, "grey_background": 0.6784, "head_markings": 0.6327, "facial_markings": 0.6945, "t-shirt": 0.724, "crossed_arms": 0.7285, "white_shirt": 0.8197, "overalls": 0.8776, "black_pants": 0.833, "eye_markings": 0.6361, "grey_pants": 0.7571, "invalid_tag": 0.5412, "brown_shirt": 0.7774, "marble_fox": 0.5572, "dyed_fur": 0.5284, "blue_overalls": 0.9203}, "stage3_selected_ranks": {"fur": 14, "open_mouth": 18, "claws": 20, "fox": 16, "shirt": 8, "open_smile": 31, "grey_background": 13, "head_markings": 19, "facial_markings": 11, "t-shirt": 10, "crossed_arms": 9, "white_shirt": 4, "overalls": 2, "black_pants": 3, "eye_markings": 17, "grey_pants": 7, "invalid_tag": 28, "brown_shirt": 5, "marble_fox": 27, "dyed_fur": 30, "blue_overalls": 1}, "stage3_selected_phrase_ranks": {"fur": 1, "open_mouth": 1, "claws": 1, "fox": 1, "shirt": 1, "open_smile": 2, "grey_background": 1, "head_markings": 2, "facial_markings": 1, "t-shirt": 2, "crossed_arms": 1, "white_shirt": 1, "overalls": 1, "black_pants": 1, "eye_markings": 2, "grey_pants": 2, "invalid_tag": 2, "brown_shirt": 2, "marble_fox": 2, "dyed_fur": 2, "blue_overalls": 1}, "extra_evidence": {"black_bottomwear": {"source": "implied"}, "black_clothing": {"source": "implied"}, 
"black_pants": {"source": "stage3", "why": "unknown", "retrieval_score": 0.833}, "blue_overalls": {"source": "stage3", "why": "unknown", "retrieval_score": 0.9203}, "brown_clothing": {"source": "implied"}, "brown_shirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7774}, "brown_topwear": {"source": "implied"}, "dyed_fur": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5284}, "eye_markings": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6361}, "grey_bottomwear": {"source": "implied"}, "grey_clothing": {"source": "implied"}, "grey_pants": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7571}, "looking_at_viewer": {"source": "structural"}, "marble_fox": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5572}, "open_mouth": {"source": "stage3", "why": "unknown", "retrieval_score": 0.633}, "open_smile": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5273}, "red_fox": {"source": "implied"}, "smile": {"source": "implied"}, "t-shirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.724}, "text": {"source": "probe"}, "white_clothing": {"source": "implied"}, "white_shirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8197}, "white_topwear": {"source": "implied"}}, "structural": ["duo", "anthro", "clothed", "looking_at_viewer"], "probe": ["text", "simple_background", "duo", "clothing", "anthro"], "t1": 2.0, "t2": 1.4, "t3": 3.28, "t3s": 0.96, "t3p": 1.52, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=30 entity=1 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
6
+ {"id": 1023509, "n_gt": 13, "n_retrieved": 34, "n_selected": 31, "n_implied": 5, "n_structural": 5, "n_probe": 8, "ret_R": 0.2308, "P": 0.1935, "R": 0.4615, "F1": 0.2727, "leaf_P": 0.1154, "leaf_R": 0.5, "leaf_F1": 0.1875, "n_leaf_sel": 26, "n_leaf_gt": 6, "ret_P": 0.0882, "sel_given_ret": 2.0, "over_sel": 2.38, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 16, "attempts_by_n_local": {"34": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.1935, "gen_R": 0.4615, "gen_F1": 0.2727, "missed": ["dialogue", "fur", "lizard", "reptile", "scalie", "white_body", "white_fur"], "extra": ["air_bubble", "anthro", "bear", "bubble", "canid", "clothed", "clothing", "cracked_ground", "darkner", "darkness", "duo", "felid", "group", "intersex", "laying_on_ground", "light", "lying_on_ground", "note_pad", "speech_bubble", "standing", "standing_over", "taur", "topwear", "vest", "waist"], "ground_truth_tags": ["bovid", "caprine", "dialogue", "fur", "goat", "human", "lizard", "mammal", "reptile", "scalie", "text", "white_body", "white_fur"], "selected_tags": ["air_bubble", "anthro", "bear", "bovid", "bubble", "canid", "caprine", "clothed", "clothing", "cracked_ground", "darkner", "darkness", "duo", "felid", "goat", "group", "human", "intersex", "laying_on_ground", "light", "lying_on_ground", "mammal", "note_pad", "speech_bubble", "standing", "standing_over", "taur", "text", "topwear", "vest", "waist"], "stage3_selected": ["air_bubble", "bubble", "cracked_ground", "darkner", "darkness", "goat", "human", 
"laying_on_ground", "light", "lying_on_ground", "note_pad", "speech_bubble", "standing", "standing_over", "vest", "waist"], "stage3_selected_scores": {"standing": 0.476, "human": 0.5621, "speech_bubble": 0.5831, "goat": 0.5841, "light": 0.5879, "vest": 0.3206, "bubble": 0.5745, "darkner": 0.4159, "lying_on_ground": 0.5998, "darkness": 0.6022, "air_bubble": 0.4381, "laying_on_ground": 0.5611, "standing_over": 0.5881, "waist": 0.7518, "cracked_ground": 0.3404, "note_pad": 0.4198}, "stage3_selected_ranks": {"standing": 19, "human": 12, "speech_bubble": 9, "goat": 8, "light": 7, "vest": 34, "bubble": 10, "darkner": 26, "lying_on_ground": 4, "darkness": 3, "air_bubble": 21, "laying_on_ground": 13, "standing_over": 6, "waist": 1, "cracked_ground": 32, "note_pad": 25}, "stage3_selected_phrase_ranks": {"standing": 2, "human": 1, "speech_bubble": 1, "goat": 1, "light": 1, "vest": 2, "bubble": 1, "darkner": 2, "lying_on_ground": 1, "darkness": 1, "air_bubble": 2, "laying_on_ground": 2, "standing_over": 1, "waist": 1, "cracked_ground": 2, "note_pad": 2}, "extra_evidence": {"air_bubble": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4381}, "anthro": {"source": "probe"}, "bear": {"source": "probe"}, "bubble": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5745}, "canid": {"source": "probe"}, "clothed": {"source": "structural"}, "clothing": {"source": "implied"}, "cracked_ground": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3404}, "darkner": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4159}, "darkness": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6022}, "duo": {"source": "probe"}, "felid": {"source": "probe"}, "group": {"source": "structural"}, "intersex": {"source": "structural"}, "laying_on_ground": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5611}, "light": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5879}, "lying_on_ground": {"source": "stage3", "why": "unknown", 
"retrieval_score": 0.5998}, "note_pad": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4198}, "speech_bubble": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5831}, "standing": {"source": "stage3", "why": "unknown", "retrieval_score": 0.476}, "standing_over": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5881}, "taur": {"source": "structural"}, "topwear": {"source": "implied"}, "vest": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3206}, "waist": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7518}}, "structural": ["group", "taur", "intersex", "clothed", "text"], "probe": ["text", "simple_background", "group", "felid", "duo", "canid", "bear", "anthro"], "t1": 2.98, "t2": 1.52, "t3": 3.72, "t3s": 1.33, "t3p": 2.16, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=34 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=1"]}
7
+ {"id": 335343, "n_gt": 15, "n_retrieved": 40, "n_selected": 23, "n_implied": 2, "n_structural": 3, "n_probe": 6, "ret_R": 0.5333, "P": 0.3478, "R": 0.5333, "F1": 0.4211, "leaf_P": 0.3, "leaf_R": 0.5, "leaf_F1": 0.375, "n_leaf_sel": 20, "n_leaf_gt": 12, "ret_P": 0.2, "sel_given_ret": 1.0, "over_sel": 1.53, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 15, "attempts_by_n_local": {"41": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3478, "gen_R": 0.5333, "gen_F1": 0.4211, "missed": ["angry", "bed", "eyes_closed", "eyeshadow", "furniture", "lying", "sleeping"], "extra": ["<3", "annoyed", "annoyed_expression", "anthro", "bedroom", "blush", "contest", "curtains_open", "dialogue", "felid", "humanoid", "mammal", "membrane_(anatomy)", "purple_membrane", "sleepover"], "ground_truth_tags": ["angry", "bed", "blonde_hair", "blue_eyes", "duo", "eyes_closed", "eyeshadow", "furniture", "green_eyes", "hair", "lying", "makeup", "purple_hair", "sleeping", "text"], "selected_tags": ["<3", "annoyed", "annoyed_expression", "anthro", "bedroom", "blonde_hair", "blue_eyes", "blush", "contest", "curtains_open", "dialogue", "duo", "felid", "green_eyes", "hair", "humanoid", "makeup", "mammal", "membrane_(anatomy)", "purple_hair", "purple_membrane", "sleepover", "text"], "stage3_selected": ["annoyed", "annoyed_expression", "bedroom", "blonde_hair", "blue_eyes", "contest", "curtains_open", "dialogue", "green_eyes", "hair", "makeup", "purple_hair", "purple_membrane", "sleepover", "text"], "stage3_selected_scores": 
{"hair": 0.6041, "text": 0.6017, "blue_eyes": 0.6023, "dialogue": 0.4457, "green_eyes": 0.5999, "blonde_hair": 0.5995, "purple_hair": 0.5647, "makeup": 0.5972, "bedroom": 0.491, "annoyed": 0.5736, "annoyed_expression": 0.7259, "curtains_open": 0.4199, "contest": 0.3499, "sleepover": 0.3806, "purple_membrane": 0.5791}, "stage3_selected_ranks": {"hair": 5, "text": 8, "blue_eyes": 7, "dialogue": 33, "green_eyes": 9, "blonde_hair": 10, "purple_hair": 14, "makeup": 11, "bedroom": 27, "annoyed": 13, "annoyed_expression": 2, "curtains_open": 36, "contest": 41, "sleepover": 40, "purple_membrane": 12}, "stage3_selected_phrase_ranks": {"hair": 1, "text": 1, "blue_eyes": 1, "dialogue": 2, "green_eyes": 1, "blonde_hair": 1, "purple_hair": 1, "makeup": 1, "bedroom": 1, "annoyed": 2, "annoyed_expression": 1, "curtains_open": 2, "contest": 2, "sleepover": 1, "purple_membrane": 1}, "extra_evidence": {"<3": {"source": "probe"}, "annoyed": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5736}, "annoyed_expression": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7259}, "anthro": {"source": "probe"}, "bedroom": {"source": "stage3", "why": "unknown", "retrieval_score": 0.491}, "blush": {"source": "probe"}, "contest": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3499}, "curtains_open": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4199}, "dialogue": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4457}, "felid": {"source": "probe"}, "humanoid": {"source": "structural"}, "mammal": {"source": "implied"}, "membrane_(anatomy)": {"source": "implied"}, "purple_membrane": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5791}, "sleepover": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3806}}, "structural": ["duo", "humanoid", "text"], "probe": ["simple_background", "felid", "duo", "blush", "anthro", "<3"], "t1": 2.2, "t2": 1.79, "t3": 2.9, "t3s": 2.2, "t3p": 2.2, "err": null, "issues": ["Stage3 split: 
general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=41 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
8
+ {"id": 2034167, "n_gt": 11, "n_retrieved": 44, "n_selected": 39, "n_implied": 12, "n_structural": 4, "n_probe": 5, "ret_R": 0.5455, "P": 0.2308, "R": 0.8182, "F1": 0.36, "leaf_P": 0.15, "leaf_R": 0.4286, "leaf_F1": 0.2222, "n_leaf_sel": 20, "n_leaf_gt": 7, "ret_P": 0.1364, "sel_given_ret": 1.5, "over_sel": 3.55, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 22, "attempts_by_n_local": {"47": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.2308, "gen_R": 0.8182, "gen_F1": 0.36, "missed": ["blue_eyes", "blue_nose"], "extra": ["animal_humanoid", "anime_eyes", "anthro", "blue_eyebrows", "blue_stripes", "canid_humanoid", "canine_humanoid", "clothed", "clothing", "curved_tail", "eyebrows", "fluffy_fur", "glistening", "glistening_tail", "humanoid", "inner_ear_fluff", "jumper", "male", "mammal_humanoid", "membrane_(anatomy)", "open_smile", "purple_membrane", "skimpy", "small_mouth", "smile", "strider-orion", "striped_back", "stripes", "tuft", "white_inner_ear_fluff"], "ground_truth_tags": ["blue_eyes", "blue_nose", "canid", "canine", "fur", "mammal", "open_mouth", "purple_body", "solo", "white_body", "white_fur"], "selected_tags": ["animal_humanoid", "anime_eyes", "anthro", "blue_eyebrows", "blue_stripes", "canid", "canid_humanoid", "canine", "canine_humanoid", "clothed", "clothing", "curved_tail", "eyebrows", "fluffy_fur", "fur", "glistening", "glistening_tail", "humanoid", "inner_ear_fluff", "jumper", "male", "mammal", "mammal_humanoid", "membrane_(anatomy)", "open_mouth", "open_smile", 
"purple_body", "purple_membrane", "skimpy", "small_mouth", "smile", "solo", "strider-orion", "striped_back", "stripes", "tuft", "white_body", "white_fur", "white_inner_ear_fluff"], "stage3_selected": ["anime_eyes", "blue_eyebrows", "blue_stripes", "canid_humanoid", "canine_humanoid", "curved_tail", "fluffy_fur", "fur", "glistening_tail", "humanoid", "jumper", "open_smile", "purple_body", "purple_membrane", "simple_background", "skimpy", "small_mouth", "strider-orion", "striped_back", "stripes", "white_fur", "white_inner_ear_fluff"], "stage3_selected_scores": {"fur": 0.5666, "simple_background": 0.5782, "white_fur": 0.5773, "humanoid": 0.6714, "stripes": 0.578, "purple_body": 0.5476, "open_smile": 0.4623, "skimpy": 0.361, "canid_humanoid": 0.8744, "canine_humanoid": 0.9128, "white_inner_ear_fluff": 0.5661, "blue_stripes": 0.5367, "blue_eyebrows": 0.4546, "glistening_tail": 0.5615, "fluffy_fur": 0.5081, "curved_tail": 0.5958, "striped_back": 0.5609, "strider-orion": 0.3692, "anime_eyes": 0.4791, "small_mouth": 0.5007, "purple_membrane": 0.5453, "jumper": 0.4005}, "stage3_selected_ranks": {"fur": 18, "simple_background": 15, "white_fur": 17, "humanoid": 5, "stripes": 16, "purple_body": 24, "open_smile": 39, "skimpy": 46, "canid_humanoid": 2, "canine_humanoid": 1, "white_inner_ear_fluff": 19, "blue_stripes": 28, "blue_eyebrows": 40, "glistening_tail": 20, "fluffy_fur": 32, "curved_tail": 8, "striped_back": 21, "strider-orion": 45, "anime_eyes": 36, "small_mouth": 34, "purple_membrane": 26, "jumper": 44}, "stage3_selected_phrase_ranks": {"fur": 1, "simple_background": 1, "white_fur": 1, "humanoid": 1, "stripes": 1, "purple_body": 1, "open_smile": 2, "skimpy": 2, "canid_humanoid": 2, "canine_humanoid": 1, "white_inner_ear_fluff": 2, "blue_stripes": 2, "blue_eyebrows": 2, "glistening_tail": 2, "fluffy_fur": 2, "curved_tail": 1, "striped_back": 2, "strider-orion": 1, "anime_eyes": 2, "small_mouth": 2, "purple_membrane": 2, "jumper": 2}, "extra_evidence": 
{"animal_humanoid": {"source": "implied"}, "anime_eyes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4791}, "anthro": {"source": "structural"}, "blue_eyebrows": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4546}, "blue_stripes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5367}, "canid_humanoid": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8744}, "canine_humanoid": {"source": "stage3", "why": "unknown", "retrieval_score": 0.9128}, "clothed": {"source": "structural"}, "clothing": {"source": "probe"}, "curved_tail": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5958}, "eyebrows": {"source": "implied"}, "fluffy_fur": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5081}, "glistening": {"source": "implied"}, "glistening_tail": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5615}, "humanoid": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6714}, "inner_ear_fluff": {"source": "implied"}, "jumper": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4005}, "male": {"source": "structural"}, "mammal_humanoid": {"source": "implied"}, "membrane_(anatomy)": {"source": "implied"}, "open_smile": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4623}, "purple_membrane": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5453}, "skimpy": {"source": "stage3", "why": "unknown", "retrieval_score": 0.361}, "small_mouth": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5007}, "smile": {"source": "implied"}, "strider-orion": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3692}, "striped_back": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5609}, "stripes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.578}, "tuft": {"source": "implied"}, "white_inner_ear_fluff": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5661}}, "structural": ["solo", "anthro", "male", "clothed"], "probe": 
["solo", "simple_background", "clothing", "canid", "anthro"], "t1": 2.34, "t2": 1.82, "t3": 7.62, "t3s": 1.54, "t3p": 3.03, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=47 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=4"]}
9
+ {"id": 1325009, "n_gt": 22, "n_retrieved": 37, "n_selected": 44, "n_implied": 11, "n_structural": 5, "n_probe": 5, "ret_R": 0.2273, "P": 0.3636, "R": 0.7273, "F1": 0.4848, "leaf_P": 0.1034, "leaf_R": 0.25, "leaf_F1": 0.1463, "n_leaf_sel": 29, "n_leaf_gt": 12, "ret_P": 0.1351, "sel_given_ret": 3.2, "over_sel": 2.0, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 26, "attempts_by_n_local": {"38": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3636, "gen_R": 0.7273, "gen_F1": 0.4848, "missed": ["chest_tuft", "muscular", "muscular_anthro", "muscular_male", "topless", "tuft"], "extra": ["avian", "belly", "bird", "confident", "countershade_belly", "countershade_body", "eyes", "fluffy_fur", "gesture", "looking_at_viewer", "muscular_arms", "muscular_legs", "no_irises", "pattern_background", "pattern_kerchief", "poof_effect", "pose", "round_head", "siberian_tiger", "striped_body", "striped_ears", "striped_fur", "suggestive_pose", "tan_bottomwear", "tan_chest", "tan_clothing", "tan_shorts", "white_chest"], "ground_truth_tags": ["anthro", "blue_eyes", "bottomwear", "chest_tuft", "clothed", "clothing", "countershading", "felid", "fur", "hand_on_head", "male", "mammal", "muscular", "muscular_anthro", "muscular_male", "pantherine", "shorts", "solo", "stripes", "tiger", "topless", "tuft"], "selected_tags": ["anthro", "avian", "belly", "bird", "blue_eyes", "bottomwear", "clothed", "clothing", "confident", "countershade_belly", "countershade_body", "countershading", "eyes", "felid", "fluffy_fur", "fur", 
"gesture", "hand_on_head", "looking_at_viewer", "male", "mammal", "muscular_arms", "muscular_legs", "no_irises", "pantherine", "pattern_background", "pattern_kerchief", "poof_effect", "pose", "round_head", "shorts", "siberian_tiger", "solo", "striped_body", "striped_ears", "striped_fur", "stripes", "suggestive_pose", "tan_bottomwear", "tan_chest", "tan_clothing", "tan_shorts", "tiger", "white_chest"], "stage3_selected": ["blue_eyes", "confident", "countershade_belly", "countershade_body", "eyes", "fluffy_fur", "gesture", "hand_on_head", "muscular_arms", "muscular_legs", "no_irises", "pattern_background", "pattern_kerchief", "poof_effect", "pose", "round_head", "shorts", "siberian_tiger", "striped_body", "striped_ears", "striped_fur", "suggestive_pose", "tan_chest", "tan_shorts", "tiger", "white_chest"], "stage3_selected_scores": {"blue_eyes": 0.5752, "pose": 0.6281, "shorts": 0.5828, "tiger": 0.5995, "gesture": 0.5905, "striped_body": 0.4104, "striped_fur": 0.6411, "hand_on_head": 0.5966, "pattern_background": 0.5244, "muscular_arms": 0.7958, "muscular_legs": 0.7903, "confident": 0.492, "white_chest": 0.9205, "suggestive_pose": 0.6275, "tan_chest": 0.8501, "no_irises": 0.488, "striped_ears": 0.4517, "countershade_body": 0.8753, "round_head": 0.4815, "fluffy_fur": 0.6703, "countershade_belly": 0.8309, "siberian_tiger": 0.4862, "tan_shorts": 0.5507, "pattern_kerchief": 0.5157, "poof_effect": 0.4448, "eyes": 0.9805}, "stage3_selected_ranks": {"blue_eyes": 25, "pose": 14, "shorts": 24, "tiger": 18, "gesture": 22, "striped_body": 38, "striped_fur": 12, "hand_on_head": 19, "pattern_background": 28, "muscular_arms": 6, "muscular_legs": 7, "confident": 31, "white_chest": 2, "suggestive_pose": 15, "tan_chest": 4, "no_irises": 32, "striped_ears": 35, "countershade_body": 3, "round_head": 34, "fluffy_fur": 10, "countershade_belly": 5, "siberian_tiger": 33, "tan_shorts": 26, "pattern_kerchief": 30, "poof_effect": 36, "eyes": 1}, "stage3_selected_phrase_ranks": {"blue_eyes": 2, 
"pose": 1, "shorts": 1, "tiger": 1, "gesture": 1, "striped_body": 2, "striped_fur": 2, "hand_on_head": 2, "pattern_background": 1, "muscular_arms": 1, "muscular_legs": 2, "confident": 1, "white_chest": 1, "suggestive_pose": 1, "tan_chest": 1, "no_irises": 2, "striped_ears": 1, "countershade_body": 1, "round_head": 2, "fluffy_fur": 1, "countershade_belly": 2, "siberian_tiger": 2, "tan_shorts": 2, "pattern_kerchief": 2, "poof_effect": 1, "eyes": 1}, "extra_evidence": {"avian": {"source": "implied"}, "belly": {"source": "implied"}, "bird": {"source": "probe"}, "confident": {"source": "stage3", "why": "unknown", "retrieval_score": 0.492}, "countershade_belly": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8309}, "countershade_body": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8753}, "eyes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.9805}, "fluffy_fur": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6703}, "gesture": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5905}, "looking_at_viewer": {"source": "structural"}, "muscular_arms": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7958}, "muscular_legs": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7903}, "no_irises": {"source": "stage3", "why": "unknown", "retrieval_score": 0.488}, "pattern_background": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5244}, "pattern_kerchief": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5157}, "poof_effect": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4448}, "pose": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6281}, "round_head": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4815}, "siberian_tiger": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4862}, "striped_body": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4104}, "striped_ears": {"source": "stage3", "why": "unknown", 
"retrieval_score": 0.4517}, "striped_fur": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6411}, "suggestive_pose": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6275}, "tan_bottomwear": {"source": "implied"}, "tan_chest": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8501}, "tan_clothing": {"source": "implied"}, "tan_shorts": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5507}, "white_chest": {"source": "stage3", "why": "unknown", "retrieval_score": 0.9205}}, "structural": ["solo", "anthro", "male", "clothed", "looking_at_viewer"], "probe": ["solo", "felid", "clothing", "bird", "anthro"], "t1": 2.2, "t2": 1.72, "t3": 3.56, "t3s": 0.69, "t3p": 1.87, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=38 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
10
+ {"id": 3285630, "n_gt": 12, "n_retrieved": 34, "n_selected": 58, "n_implied": 18, "n_structural": 4, "n_probe": 5, "ret_R": 0.25, "P": 0.1724, "R": 0.8333, "F1": 0.2857, "leaf_P": 0.125, "leaf_R": 0.4444, "leaf_F1": 0.1951, "n_leaf_sel": 32, "n_leaf_gt": 9, "ret_P": 0.0882, "sel_given_ret": 3.3333, "over_sel": 4.83, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 37, "attempts_by_n_local": {"37": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.1724, "gen_R": 0.8333, "gen_F1": 0.2857, "missed": ["alpha_channel", "fingers"], "extra": ["bag", "black_body", "black_fur", "black_nose", "blowup_background", "breasts", "brown_clothing", "brown_jacket", "brown_topwear", "brown_vest", "business_attire", "can", "clasped_hands", "cleavage", "cleavage_overflow", "container", "dress_shirt", "dyed_fur", "fist", "formal", "green_background", "grey_clothing", "grey_shirt", "grey_topwear", "hair_bun", "hands_together", "holding_can", "holding_container", "holding_mug", "holding_object", "humor", "jacket", "jacket_vest", "mug", "necktie", "pun", "serious", "shirt", "t-shirt", "teal_shirt", "teal_topwear", "text", "topwear", "vest", "white_clothing", "white_necktie", "white_topwear", "wide_hips"], "ground_truth_tags": ["alpha_channel", "anthro", "clothed", "clothing", "felid", "feline", "fingers", "fur", "hair", "male", "mammal", "solo"], "selected_tags": ["anthro", "bag", "black_body", "black_fur", "black_nose", "blowup_background", "breasts", "brown_clothing", "brown_jacket", "brown_topwear", "brown_vest", 
"business_attire", "can", "clasped_hands", "cleavage", "cleavage_overflow", "clothed", "clothing", "container", "dress_shirt", "dyed_fur", "felid", "feline", "fist", "formal", "fur", "green_background", "grey_clothing", "grey_shirt", "grey_topwear", "hair", "hair_bun", "hands_together", "holding_can", "holding_container", "holding_mug", "holding_object", "humor", "jacket", "jacket_vest", "male", "mammal", "mug", "necktie", "pun", "serious", "shirt", "solo", "t-shirt", "teal_shirt", "teal_topwear", "text", "topwear", "vest", "white_clothing", "white_necktie", "white_topwear", "wide_hips"], "stage3_selected": ["bag", "black_fur", "black_nose", "blowup_background", "brown_jacket", "brown_vest", "business_attire", "clasped_hands", "cleavage_overflow", "dress_shirt", "dyed_fur", "felid", "feline", "fist", "formal", "fur", "green_background", "grey_shirt", "hair_bun", "hands_together", "holding_can", "holding_mug", "invalid_background", "invalid_tag", "jacket_vest", "mug", "necktie", "pun", "serious", "shirt", "simple_background", "t-shirt", "teal_shirt", "vest", "white_necktie", "white_topwear", "wide_hips"], "stage3_selected_scores": {"fur": 0.7146, "simple_background": 0.6978, "felid": 0.6418, "feline": 0.7062, "shirt": 0.7998, "wide_hips": 0.4732, "black_nose": 0.6261, "black_fur": 0.7183, "necktie": 0.7314, "t-shirt": 0.7846, "white_topwear": 0.7154, "vest": 0.8403, "green_background": 0.6069, "fist": 0.5544, "bag": 0.5527, "dress_shirt": 0.6132, "hair_bun": 0.6926, "pun": 0.5182, "cleavage_overflow": 0.4789, "mug": 0.8841, "hands_together": 0.5547, "grey_shirt": 0.7582, "serious": 0.5823, "invalid_tag": 0.5751, "holding_mug": 0.916, "clasped_hands": 0.6268, "brown_jacket": 0.7523, "blowup_background": 0.6356, "holding_can": 0.7864, "formal": 0.5993, "business_attire": 0.5558, "dyed_fur": 0.6226, "jacket_vest": 0.772, "brown_vest": 0.8153, "teal_shirt": 0.7474, "white_necktie": 0.6418, "invalid_background": 0.6495}, "stage3_selected_ranks": {"fur": 15, 
"simple_background": 17, "felid": 20, "feline": 16, "shirt": 5, "wide_hips": 37, "black_nose": 24, "black_fur": 13, "necktie": 12, "t-shirt": 7, "white_topwear": 14, "vest": 3, "green_background": 27, "fist": 33, "bag": 34, "dress_shirt": 26, "hair_bun": 18, "pun": 35, "cleavage_overflow": 36, "mug": 2, "hands_together": 32, "grey_shirt": 9, "serious": 29, "invalid_tag": 30, "holding_mug": 1, "clasped_hands": 23, "brown_jacket": 10, "blowup_background": 22, "holding_can": 6, "formal": 28, "business_attire": 31, "dyed_fur": 25, "jacket_vest": 8, "brown_vest": 4, "teal_shirt": 11, "white_necktie": 21, "invalid_background": 19}, "stage3_selected_phrase_ranks": {"fur": 1, "simple_background": 1, "felid": 2, "feline": 1, "shirt": 1, "wide_hips": 2, "black_nose": 2, "black_fur": 1, "necktie": 1, "t-shirt": 2, "white_topwear": 1, "vest": 1, "green_background": 2, "fist": 2, "bag": 2, "dress_shirt": 2, "hair_bun": 1, "pun": 2, "cleavage_overflow": 1, "mug": 1, "hands_together": 2, "grey_shirt": 1, "serious": 2, "invalid_tag": 1, "holding_mug": 1, "clasped_hands": 1, "brown_jacket": 2, "blowup_background": 2, "holding_can": 2, "formal": 1, "business_attire": 1, "dyed_fur": 2, "jacket_vest": 2, "brown_vest": 1, "teal_shirt": 2, "white_necktie": 2, "invalid_background": 1}, "extra_evidence": {"bag": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5527}, "black_body": {"source": "implied"}, "black_fur": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7183}, "black_nose": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6261}, "blowup_background": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6356}, "breasts": {"source": "implied"}, "brown_clothing": {"source": "implied"}, "brown_jacket": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7523}, "brown_topwear": {"source": "implied"}, "brown_vest": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8153}, "business_attire": {"source": "stage3", "why": "unknown", 
"retrieval_score": 0.5558}, "can": {"source": "implied"}, "clasped_hands": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6268}, "cleavage": {"source": "implied"}, "cleavage_overflow": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4789}, "container": {"source": "implied"}, "dress_shirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6132}, "dyed_fur": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6226}, "fist": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5544}, "formal": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5993}, "green_background": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6069}, "grey_clothing": {"source": "implied"}, "grey_shirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7582}, "grey_topwear": {"source": "implied"}, "hair_bun": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6926}, "hands_together": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5547}, "holding_can": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7864}, "holding_container": {"source": "implied"}, "holding_mug": {"source": "stage3", "why": "unknown", "retrieval_score": 0.916}, "holding_object": {"source": "implied"}, "humor": {"source": "implied"}, "jacket": {"source": "implied"}, "jacket_vest": {"source": "stage3", "why": "unknown", "retrieval_score": 0.772}, "mug": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8841}, "necktie": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7314}, "pun": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5182}, "serious": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5823}, "shirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7998}, "t-shirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7846}, "teal_shirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7474}, "teal_topwear": {"source": "implied"}, 
"text": {"source": "probe"}, "topwear": {"source": "implied"}, "vest": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8403}, "white_clothing": {"source": "implied"}, "white_necktie": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6418}, "white_topwear": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7154}, "wide_hips": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4732}}, "structural": ["solo", "anthro", "male", "clothed"], "probe": ["text", "solo", "felid", "clothing", "anthro"], "t1": 1.77, "t2": 1.45, "t3": 5.25, "t3s": 4.25, "t3p": 3.94, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=37 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
11
+ {"id": 260449, "n_gt": 14, "n_retrieved": 35, "n_selected": 26, "n_implied": 4, "n_structural": 6, "n_probe": 6, "ret_R": 0.5714, "P": 0.4231, "R": 0.7857, "F1": 0.55, "leaf_P": 0.2353, "leaf_R": 0.4, "leaf_F1": 0.2963, "n_leaf_sel": 17, "n_leaf_gt": 10, "ret_P": 0.2286, "sel_given_ret": 1.375, "over_sel": 1.86, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 15, "attempts_by_n_local": {"36": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.4231, "gen_R": 0.7857, "gen_F1": 0.55, "missed": ["fur", "human", "male"], "extra": ["anthro", "arm_hair", "body_hair", "cheeky", "duo", "flash", "front_view", "gorilla", "grin", "humanoid", "loincloth_only", "raised_arms", "smile", "topless", "trio"], "ground_truth_tags": ["ape", "bear", "clothed", "clothing", "dancing", "fur", "group", "hair", "haplorhine", "human", "looking_at_viewer", "male", "mammal", "primate"], "selected_tags": ["anthro", "ape", "arm_hair", "bear", "body_hair", "cheeky", "clothed", "clothing", "dancing", "duo", "flash", "front_view", "gorilla", "grin", "group", "hair", "haplorhine", "humanoid", "loincloth_only", "looking_at_viewer", "mammal", "primate", "raised_arms", "smile", "topless", "trio"], "stage3_selected": ["ape", "arm_hair", "bear", "cheeky", "dancing", "flash", "front_view", "gorilla", "grin", "hair", "haplorhine", "loincloth_only", "looking_at_viewer", "raised_arms", "simple_background"], "stage3_selected_scores": {"hair": 0.5455, "simple_background": 0.5491, "looking_at_viewer": 0.5483, "bear": 0.5736, "front_view": 
0.4614, "grin": 0.5653, "haplorhine": 0.8324, "dancing": 0.5576, "ape": 0.9767, "raised_arms": 0.5461, "gorilla": 0.8299, "arm_hair": 0.3661, "flash": 0.3198, "loincloth_only": 0.4961, "cheeky": 0.3905}, "stage3_selected_ranks": {"hair": 14, "simple_background": 11, "looking_at_viewer": 12, "bear": 6, "front_view": 24, "grin": 8, "haplorhine": 3, "dancing": 10, "ape": 1, "raised_arms": 13, "gorilla": 4, "arm_hair": 36, "flash": 38, "loincloth_only": 21, "cheeky": 33}, "stage3_selected_phrase_ranks": {"hair": 1, "simple_background": 1, "looking_at_viewer": 1, "bear": 1, "front_view": 2, "grin": 1, "haplorhine": 2, "dancing": 1, "ape": 1, "raised_arms": 1, "gorilla": 1, "arm_hair": 2, "flash": 2, "loincloth_only": 2, "cheeky": 1}, "extra_evidence": {"anthro": {"source": "structural"}, "arm_hair": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3661}, "body_hair": {"source": "implied"}, "cheeky": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3905}, "duo": {"source": "probe"}, "flash": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3198}, "front_view": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4614}, "gorilla": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8299}, "grin": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5653}, "humanoid": {"source": "structural"}, "loincloth_only": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4961}, "raised_arms": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5461}, "smile": {"source": "implied"}, "topless": {"source": "structural"}, "trio": {"source": "structural"}}, "structural": ["trio", "anthro", "humanoid", "clothed", "topless", "looking_at_viewer"], "probe": ["simple_background", "group", "duo", "clothing", "bear", "anthro"], "t1": 1.79, "t2": 1.83, "t3": 2.31, "t3s": 1.03, "t3p": 1.86, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: 
general=36 entity=1 copyright_filtered=1 generic_char_to_general=1 unknown_type=2"]}
data/eval_results/k_sweep_explicit_no_why_seed42_k3.jsonl ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"_meta": true, "timestamp": "2026-03-03T05:57:22.405923", "n_samples": 10, "caption_field": "caption_cogvlm", "skip_rewrite": false, "allow_nsfw": false, "mode": "chunked_map_union", "chunk_size": 60, "eval_path": "data\\eval_samples\\e621_sfw_sample_1000_seed123_buffer10000_caption_evident.jsonl", "per_phrase_k": 2, "per_phrase_final_k": 3, "temperature": 0.0, "shuffle": true, "seed": 42, "workers": 1, "min_why": "strong_implied", "expand_implications": true, "infer_structural": true, "infer_probe": true, "n_errors": 0, "n_issue_samples": 10, "n_issues_total": 20}
2
+ {"id": 17482, "n_gt": 22, "n_retrieved": 41, "n_selected": 47, "n_implied": 20, "n_structural": 3, "n_probe": 4, "ret_R": 0.2727, "P": 0.3617, "R": 0.7727, "F1": 0.4928, "leaf_P": 0.25, "leaf_R": 0.3846, "leaf_F1": 0.303, "n_leaf_sel": 20, "n_leaf_gt": 13, "ret_P": 0.1463, "sel_given_ret": 2.8333, "over_sel": 2.14, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 24, "attempts_by_n_local": {"44": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3617, "gen_R": 0.7727, "gen_F1": 0.4928, "missed": ["fingers", "fur", "holding_musical_instrument", "holding_object", "music"], "extra": ["5_claws", "arctic_wolf", "blank_expression", "blowup_background", "bottomwear", "canis", "colorful", "colorful_background", "denim", "denim_clothing", "flowing_hair", "glowing", "glowing_hair", "jeans", "long_hair", "maned_wolf", "membrane_(anatomy)", "membranous_wings", "pants", "playing_guitar", "playing_music", "red_hair", "t-pose", "tail_tuft", "torn_bottomwear", "torn_jeans", "torn_pants", "tuft", "wings", "wolf"], "ground_truth_tags": ["anthro", "bass_guitar", "canid", "canine", "claws", "clothed", "clothing", "fingers", "fur", "guitar", "hair", "holding_musical_instrument", "holding_object", "mammal", "music", "musical_instrument", "plucked_string_instrument", "solo", "spade_tail", "string_instrument", "tail", "torn_clothing"], "selected_tags": ["5_claws", "anthro", "arctic_wolf", "bass_guitar", "blank_expression", "blowup_background", "bottomwear", "canid", "canine", "canis", "claws", "clothed", "clothing", 
"colorful", "colorful_background", "denim", "denim_clothing", "flowing_hair", "glowing", "glowing_hair", "guitar", "hair", "jeans", "long_hair", "mammal", "maned_wolf", "membrane_(anatomy)", "membranous_wings", "musical_instrument", "pants", "playing_guitar", "playing_music", "plucked_string_instrument", "red_hair", "solo", "spade_tail", "string_instrument", "t-pose", "tail", "tail_tuft", "torn_bottomwear", "torn_clothing", "torn_jeans", "torn_pants", "tuft", "wings", "wolf"], "stage3_selected": ["5_claws", "abstract_background", "arctic_wolf", "bass_guitar", "blank_expression", "blowup_background", "claws", "colorful_background", "flowing_hair", "glowing_hair", "hair", "long_hair", "maned_wolf", "membranous_wings", "playing_guitar", "red_hair", "spade_tail", "t-pose", "tail", "tail_tuft", "torn_bottomwear", "torn_jeans", "torn_pants", "wolf"], "stage3_selected_scores": {"hair": 0.573, "tail": 0.5659, "claws": 0.5684, "wolf": 0.5782, "long_hair": 0.4286, "red_hair": 0.4258, "membranous_wings": 0.4106, "abstract_background": 0.4924, "tail_tuft": 0.4302, "spade_tail": 0.618, "torn_bottomwear": 0.4362, "torn_pants": 0.4639, "maned_wolf": 0.4599, "arctic_wolf": 0.4908, "playing_guitar": 0.9317, "torn_jeans": 0.4824, "glowing_hair": 0.4302, "bass_guitar": 0.9118, "flowing_hair": 0.5669, "blowup_background": 0.5038, "t-pose": 0.5519, "colorful_background": 0.5132, "5_claws": 0.4601, "blank_expression": 0.4242}, "stage3_selected_ranks": {"hair": 9, "tail": 12, "claws": 10, "wolf": 7, "long_hair": 39, "red_hair": 40, "membranous_wings": 42, "abstract_background": 24, "tail_tuft": 38, "spade_tail": 5, "torn_bottomwear": 34, "torn_pants": 28, "maned_wolf": 31, "arctic_wolf": 26, "playing_guitar": 2, "torn_jeans": 27, "glowing_hair": 36, "bass_guitar": 3, "flowing_hair": 11, "blowup_background": 22, "t-pose": 15, "colorful_background": 21, "5_claws": 30, "blank_expression": 41}, "stage3_selected_phrase_ranks": {"hair": 1, "tail": 1, "claws": 1, "wolf": 1, "long_hair": 2, 
"red_hair": 3, "membranous_wings": 2, "abstract_background": 3, "tail_tuft": 3, "spade_tail": 1, "torn_bottomwear": 3, "torn_pants": 2, "maned_wolf": 3, "arctic_wolf": 2, "playing_guitar": 1, "torn_jeans": 1, "glowing_hair": 2, "bass_guitar": 2, "flowing_hair": 1, "blowup_background": 2, "t-pose": 2, "colorful_background": 2, "5_claws": 3, "blank_expression": 3}, "extra_evidence": {"5_claws": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4601}, "arctic_wolf": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4908}, "blank_expression": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4242}, "blowup_background": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5038}, "bottomwear": {"source": "implied"}, "canis": {"source": "implied"}, "colorful": {"source": "implied"}, "colorful_background": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5132}, "denim": {"source": "implied"}, "denim_clothing": {"source": "implied"}, "flowing_hair": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5669}, "glowing": {"source": "implied"}, "glowing_hair": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4302}, "jeans": {"source": "implied"}, "long_hair": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4286}, "maned_wolf": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4599}, "membrane_(anatomy)": {"source": "implied"}, "membranous_wings": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4106}, "pants": {"source": "implied"}, "playing_guitar": {"source": "stage3", "why": "unknown", "retrieval_score": 0.9317}, "playing_music": {"source": "implied"}, "red_hair": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4258}, "t-pose": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5519}, "tail_tuft": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4302}, "torn_bottomwear": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4362}, 
"torn_jeans": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4824}, "torn_pants": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4639}, "tuft": {"source": "implied"}, "wings": {"source": "implied"}, "wolf": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5782}}, "structural": ["solo", "anthro", "clothed"], "probe": ["solo", "clothing", "canid", "anthro"], "t1": 1.83, "t2": 2.54, "t3": 3.41, "t3s": 3.01, "t3p": 4.22, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=44 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
3
+ {"id": 1624724, "n_gt": 4, "n_retrieved": 38, "n_selected": 24, "n_implied": 2, "n_structural": 3, "n_probe": 4, "ret_R": 0.75, "P": 0.1667, "R": 1.0, "F1": 0.2857, "leaf_P": 0.1429, "leaf_R": 0.75, "leaf_F1": 0.24, "n_leaf_sel": 21, "n_leaf_gt": 4, "ret_P": 0.0789, "sel_given_ret": 1.3333, "over_sel": 6.0, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 19, "attempts_by_n_local": {"40": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.1667, "gen_R": 1.0, "gen_F1": 0.2857, "missed": [], "extra": ["<3", "anthro", "big_eyes", "big_iris", "clothed", "clothing", "floating", "glistening", "glistening_eyes", "light_nose", "no_irises", "nose", "pale_body", "pink_mouth", "smiling_at_viewer", "spots", "spotted_face", "tan_chest", "toony", "unknown_species"], "ground_truth_tags": ["red_nose", "smile", "solo", "tan_body"], "selected_tags": ["<3", "anthro", "big_eyes", "big_iris", "clothed", "clothing", "floating", "glistening", "glistening_eyes", "light_nose", "no_irises", "nose", "pale_body", "pink_mouth", "red_nose", "smile", "smiling_at_viewer", "solo", "spots", "spotted_face", "tan_body", "tan_chest", "toony", "unknown_species"], "stage3_selected": ["big_eyes", "big_iris", "floating", "glistening_eyes", "light_nose", "no_irises", "nose", "pale_body", "pink_mouth", "red_nose", "smile", "smiling_at_viewer", "spots", "spotted_face", "tan_body", "tan_chest", "toony", "unknown_species", "white_background"], "stage3_selected_scores": {"smile": 0.5956, "white_background": 0.6072, "tan_body": 0.6582, 
"spots": 0.6224, "toony": 0.5172, "glistening_eyes": 0.494, "unknown_species": 0.5802, "smiling_at_viewer": 0.5323, "big_eyes": 0.6934, "red_nose": 0.7475, "floating": 0.6454, "tan_chest": 0.6867, "spotted_face": 0.6973, "no_irises": 0.4925, "pink_mouth": 0.6468, "light_nose": 0.6631, "big_iris": 0.566, "pale_body": 0.4677, "nose": 0.8611}, "stage3_selected_ranks": {"smile": 19, "white_background": 18, "tan_body": 10, "spots": 16, "toony": 31, "glistening_eyes": 32, "unknown_species": 20, "smiling_at_viewer": 28, "big_eyes": 7, "red_nose": 3, "floating": 12, "tan_chest": 8, "spotted_face": 5, "no_irises": 33, "pink_mouth": 11, "light_nose": 9, "big_iris": 23, "pale_body": 38, "nose": 2}, "stage3_selected_phrase_ranks": {"smile": 2, "white_background": 1, "tan_body": 3, "spots": 3, "toony": 1, "glistening_eyes": 2, "unknown_species": 1, "smiling_at_viewer": 3, "big_eyes": 1, "red_nose": 1, "floating": 1, "tan_chest": 2, "spotted_face": 2, "no_irises": 3, "pink_mouth": 1, "light_nose": 2, "big_iris": 3, "pale_body": 3, "nose": 1}, "extra_evidence": {"<3": {"source": "probe"}, "anthro": {"source": "structural"}, "big_eyes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6934}, "big_iris": {"source": "stage3", "why": "unknown", "retrieval_score": 0.566}, "clothed": {"source": "structural"}, "clothing": {"source": "implied"}, "floating": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6454}, "glistening": {"source": "implied"}, "glistening_eyes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.494}, "light_nose": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6631}, "no_irises": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4925}, "nose": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8611}, "pale_body": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4677}, "pink_mouth": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6468}, "smiling_at_viewer": {"source": "stage3", 
"why": "unknown", "retrieval_score": 0.5323}, "spots": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6224}, "spotted_face": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6973}, "tan_chest": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6867}, "toony": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5172}, "unknown_species": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5802}}, "structural": ["solo", "anthro", "clothed"], "probe": ["solo", "simple_background", "anthro", "<3"], "t1": 1.26, "t2": 1.13, "t3": 1.33, "t3s": 1.5, "t3p": 1.23, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=40 entity=0 copyright_filtered=1 generic_char_to_general=0 unknown_type=4"]}
4
+ {"id": 1078019, "n_gt": 14, "n_retrieved": 46, "n_selected": 38, "n_implied": 8, "n_structural": 4, "n_probe": 5, "ret_R": 0.6429, "P": 0.3421, "R": 0.9286, "F1": 0.5, "leaf_P": 0.25, "leaf_R": 0.6667, "leaf_F1": 0.3636, "n_leaf_sel": 24, "n_leaf_gt": 9, "ret_P": 0.1957, "sel_given_ret": 1.4444, "over_sel": 2.71, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 24, "attempts_by_n_local": {"47": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3421, "gen_R": 0.9286, "gen_F1": 0.5, "missed": ["romantic_couple"], "extra": ["<3", "blush_lines", "cheek_tuft", "close-up", "coat", "diaper", "eyes_closed", "facial_tuft", "glistening", "glistening_eyes", "gradient_eyes", "half-closed_eyes", "holding_object", "holding_plushie", "looking_at_viewer", "narrowed_eyes", "pull-ups_(diaper)", "raincoat", "rosy_cheeks", "round_eyes", "small_eyes", "surprised_expression", "surprised_look", "topwear", "tuft"], "ground_truth_tags": ["anthro", "blue_eyes", "blush", "clothed", "clothing", "duo", "lagomorph", "leporid", "mammal", "plushie", "rabbit", "romantic", "romantic_couple", "teal_eyes"], "selected_tags": ["<3", "anthro", "blue_eyes", "blush", "blush_lines", "cheek_tuft", "close-up", "clothed", "clothing", "coat", "diaper", "duo", "eyes_closed", "facial_tuft", "glistening", "glistening_eyes", "gradient_eyes", "half-closed_eyes", "holding_object", "holding_plushie", "lagomorph", "leporid", "looking_at_viewer", "mammal", "narrowed_eyes", "plushie", "pull-ups_(diaper)", "rabbit", "raincoat", "romantic", 
"rosy_cheeks", "round_eyes", "small_eyes", "surprised_expression", "surprised_look", "teal_eyes", "topwear", "tuft"], "stage3_selected": ["blue_eyes", "blush_lines", "cheek_tuft", "close-up", "coat", "duo", "eyes_closed", "glistening_eyes", "gradient_eyes", "half-closed_eyes", "holding_plushie", "lagomorph", "leporid", "plushie", "pull-ups_(diaper)", "rabbit", "raincoat", "romantic", "rosy_cheeks", "round_eyes", "small_eyes", "surprised_expression", "surprised_look", "teal_eyes"], "stage3_selected_scores": {"duo": 0.3257, "blue_eyes": 0.6151, "eyes_closed": 0.4028, "lagomorph": 0.5325, "leporid": 0.5311, "rabbit": 0.5939, "half-closed_eyes": 0.5138, "cheek_tuft": 0.4678, "romantic": 0.5603, "close-up": 0.3803, "blush_lines": 0.4756, "glistening_eyes": 0.4543, "coat": 0.6383, "plushie": 0.7455, "teal_eyes": 0.6283, "surprised_expression": 0.639, "rosy_cheeks": 0.472, "holding_plushie": 0.7793, "raincoat": 0.5262, "small_eyes": 0.6187, "surprised_look": 0.6399, "round_eyes": 0.4887, "pull-ups_(diaper)": 0.5206, "gradient_eyes": 0.4784}, "stage3_selected_ranks": {"duo": 47, "blue_eyes": 12, "eyes_closed": 44, "lagomorph": 23, "leporid": 24, "rabbit": 13, "half-closed_eyes": 30, "cheek_tuft": 39, "romantic": 16, "close-up": 45, "blush_lines": 36, "glistening_eyes": 42, "coat": 7, "plushie": 3, "teal_eyes": 8, "surprised_expression": 6, "rosy_cheeks": 37, "holding_plushie": 2, "raincoat": 26, "small_eyes": 11, "surprised_look": 5, "round_eyes": 32, "pull-ups_(diaper)": 29, "gradient_eyes": 35}, "stage3_selected_phrase_ranks": {"duo": 3, "blue_eyes": 1, "eyes_closed": 2, "lagomorph": 2, "leporid": 3, "rabbit": 1, "half-closed_eyes": 2, "cheek_tuft": 3, "romantic": 2, "close-up": 3, "blush_lines": 3, "glistening_eyes": 3, "coat": 1, "plushie": 1, "teal_eyes": 1, "surprised_expression": 2, "rosy_cheeks": 2, "holding_plushie": 1, "raincoat": 2, "small_eyes": 1, "surprised_look": 1, "round_eyes": 1, "pull-ups_(diaper)": 2, "gradient_eyes": 2}, "extra_evidence": {"<3": 
{"source": "probe"}, "blush_lines": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4756}, "cheek_tuft": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4678}, "close-up": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3803}, "coat": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6383}, "diaper": {"source": "implied"}, "eyes_closed": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4028}, "facial_tuft": {"source": "implied"}, "glistening": {"source": "implied"}, "glistening_eyes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4543}, "gradient_eyes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4784}, "half-closed_eyes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5138}, "holding_object": {"source": "implied"}, "holding_plushie": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7793}, "looking_at_viewer": {"source": "structural"}, "narrowed_eyes": {"source": "implied"}, "pull-ups_(diaper)": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5206}, "raincoat": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5262}, "rosy_cheeks": {"source": "stage3", "why": "unknown", "retrieval_score": 0.472}, "round_eyes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4887}, "small_eyes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6187}, "surprised_expression": {"source": "stage3", "why": "unknown", "retrieval_score": 0.639}, "surprised_look": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6399}, "topwear": {"source": "implied"}, "tuft": {"source": "implied"}}, "structural": ["duo", "anthro", "clothed", "looking_at_viewer"], "probe": ["duo", "clothing", "blush", "anthro", "<3"], "t1": 1.85, "t2": 1.4, "t3": 4.65, "t3s": 1.07, "t3p": 1.53, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=47 entity=0 
copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
5
+ {"id": 2021552, "n_gt": 25, "n_retrieved": 51, "n_selected": 65, "n_implied": 20, "n_structural": 4, "n_probe": 6, "ret_R": 0.56, "P": 0.3692, "R": 0.96, "F1": 0.5333, "leaf_P": 0.2353, "leaf_R": 0.5333, "leaf_F1": 0.3265, "n_leaf_sel": 34, "n_leaf_gt": 15, "ret_P": 0.2745, "sel_given_ret": 1.7143, "over_sel": 2.6, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 39, "attempts_by_n_local": {"52": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3692, "gen_R": 0.96, "gen_F1": 0.5333, "missed": ["looking_at_another"], "extra": ["4_claws", "5_claws", "black_bottomwear", "black_clothing", "black_pants", "blowup_background", "blue_clothing", "blue_overalls", "blue_shirt", "blue_topwear", "blush", "buckteeth", "eye_markings", "fennec_fox", "front_view", "geometric_background", "gloves_(marking)", "grid_background", "long_arms", "looking_at_viewer", "on_one_leg", "open_mouth", "open_smile", "pattern_background", "red_mouth", "smile", "snout", "snout_markings", "t-shirt", "tan_body", "tan_bottomwear", "tan_clothing", "tan_fur", "tan_pants", "teeth", "undershirt", "white_clothing", "white_shirt", "white_topwear", "wide_eyed", "yellow_background"], "ground_truth_tags": ["anthro", "bottomwear", "canid", "canine", "claws", "clothed", "clothing", "crossed_arms", "duo", "facial_markings", "fox", "fur", "grey_background", "head_markings", "lagomorph", "leporid", "looking_at_another", "mammal", "markings", "overalls", "pants", "rabbit", "shirt", "standing", "topwear"], "selected_tags": ["4_claws", 
"5_claws", "anthro", "black_bottomwear", "black_clothing", "black_pants", "blowup_background", "blue_clothing", "blue_overalls", "blue_shirt", "blue_topwear", "blush", "bottomwear", "buckteeth", "canid", "canine", "claws", "clothed", "clothing", "crossed_arms", "duo", "eye_markings", "facial_markings", "fennec_fox", "fox", "front_view", "fur", "geometric_background", "gloves_(marking)", "grey_background", "grid_background", "head_markings", "lagomorph", "leporid", "long_arms", "looking_at_viewer", "mammal", "markings", "on_one_leg", "open_mouth", "open_smile", "overalls", "pants", "pattern_background", "rabbit", "red_mouth", "shirt", "smile", "snout", "snout_markings", "standing", "t-shirt", "tan_body", "tan_bottomwear", "tan_clothing", "tan_fur", "tan_pants", "teeth", "topwear", "undershirt", "white_clothing", "white_shirt", "white_topwear", "wide_eyed", "yellow_background"], "stage3_selected": ["4_claws", "5_claws", "black_pants", "blowup_background", "blue_overalls", "blue_shirt", "buckteeth", "claws", "crossed_arms", "eye_markings", "facial_markings", "fennec_fox", "fox", "front_view", "fur", "gloves_(marking)", "grey_background", "grid_background", "head_markings", "invalid_tag", "lagomorph", "long_arms", "on_one_leg", "open_mouth", "open_smile", "overalls", "rabbit", "red_mouth", "shirt", "snout_markings", "standing", "t-shirt", "tan_fur", "tan_pants", "undershirt", "white_shirt", "white_topwear", "wide_eyed", "yellow_background"], "stage3_selected_scores": {"fur": 0.654, "open_mouth": 0.6338, "claws": 0.6311, "standing": 0.6886, "fox": 0.6387, "shirt": 0.7491, "lagomorph": 0.5942, "rabbit": 0.6517, "front_view": 0.5154, "tan_fur": 0.52, "open_smile": 0.528, "grey_background": 0.6792, "gloves_(marking)": 0.6271, "head_markings": 0.6334, "buckteeth": 0.532, "facial_markings": 0.6951, "t-shirt": 0.7246, "crossed_arms": 0.7292, "wide_eyed": 0.4677, "white_topwear": 0.7676, "white_shirt": 0.8202, "on_one_leg": 0.5769, "yellow_background": 0.5951, "overalls": 
0.878, "black_pants": 0.8334, "blue_shirt": 0.6699, "eye_markings": 0.6366, "snout_markings": 0.6219, "invalid_tag": 0.5414, "undershirt": 0.7069, "4_claws": 0.5923, "blowup_background": 0.5948, "long_arms": 0.586, "tan_pants": 0.7502, "grid_background": 0.6147, "blue_overalls": 0.9204, "5_claws": 0.6023, "red_mouth": 0.545, "fennec_fox": 0.5037}, "stage3_selected_ranks": {"fur": 21, "open_mouth": 25, "claws": 27, "standing": 16, "fox": 23, "shirt": 10, "lagomorph": 36, "rabbit": 22, "front_view": 51, "tan_fur": 50, "open_smile": 49, "grey_background": 18, "gloves_(marking)": 28, "head_markings": 26, "buckteeth": 47, "facial_markings": 15, "t-shirt": 12, "crossed_arms": 11, "wide_eyed": 53, "white_topwear": 6, "white_shirt": 4, "on_one_leg": 39, "yellow_background": 34, "overalls": 2, "black_pants": 3, "blue_shirt": 20, "eye_markings": 24, "snout_markings": 29, "invalid_tag": 45, "undershirt": 13, "4_claws": 37, "blowup_background": 35, "long_arms": 38, "tan_pants": 9, "grid_background": 31, "blue_overalls": 1, "5_claws": 32, "red_mouth": 44, "fennec_fox": 52}, "stage3_selected_phrase_ranks": {"fur": 1, "open_mouth": 1, "claws": 1, "standing": 1, "fox": 1, "shirt": 1, "lagomorph": 3, "rabbit": 1, "front_view": 3, "tan_fur": 3, "open_smile": 2, "grey_background": 1, "gloves_(marking)": 3, "head_markings": 2, "buckteeth": 3, "facial_markings": 1, "t-shirt": 2, "crossed_arms": 1, "wide_eyed": 3, "white_topwear": 3, "white_shirt": 1, "on_one_leg": 2, "yellow_background": 3, "overalls": 1, "black_pants": 1, "blue_shirt": 3, "eye_markings": 2, "snout_markings": 3, "invalid_tag": 3, "undershirt": 3, "4_claws": 3, "blowup_background": 3, "long_arms": 2, "tan_pants": 3, "grid_background": 2, "blue_overalls": 1, "5_claws": 2, "red_mouth": 2, "fennec_fox": 3}, "extra_evidence": {"4_claws": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5923}, "5_claws": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6023}, "black_bottomwear": {"source": "implied"}, 
"black_clothing": {"source": "implied"}, "black_pants": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8334}, "blowup_background": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5948}, "blue_clothing": {"source": "implied"}, "blue_overalls": {"source": "stage3", "why": "unknown", "retrieval_score": 0.9204}, "blue_shirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6699}, "blue_topwear": {"source": "implied"}, "blush": {"source": "probe"}, "buckteeth": {"source": "stage3", "why": "unknown", "retrieval_score": 0.532}, "eye_markings": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6366}, "fennec_fox": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5037}, "front_view": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5154}, "geometric_background": {"source": "implied"}, "gloves_(marking)": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6271}, "grid_background": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6147}, "long_arms": {"source": "stage3", "why": "unknown", "retrieval_score": 0.586}, "looking_at_viewer": {"source": "structural"}, "on_one_leg": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5769}, "open_mouth": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6338}, "open_smile": {"source": "stage3", "why": "unknown", "retrieval_score": 0.528}, "pattern_background": {"source": "implied"}, "red_mouth": {"source": "stage3", "why": "unknown", "retrieval_score": 0.545}, "smile": {"source": "implied"}, "snout": {"source": "implied"}, "snout_markings": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6219}, "t-shirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7246}, "tan_body": {"source": "implied"}, "tan_bottomwear": {"source": "implied"}, "tan_clothing": {"source": "implied"}, "tan_fur": {"source": "stage3", "why": "unknown", "retrieval_score": 0.52}, "tan_pants": {"source": "stage3", "why": "unknown", 
"retrieval_score": 0.7502}, "teeth": {"source": "implied"}, "undershirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7069}, "white_clothing": {"source": "implied"}, "white_shirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8202}, "white_topwear": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7676}, "wide_eyed": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4677}, "yellow_background": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5951}}, "structural": ["duo", "anthro", "clothed", "looking_at_viewer"], "probe": ["simple_background", "duo", "clothing", "canid", "blush", "anthro"], "t1": 1.91, "t2": 1.47, "t3": 7.15, "t3s": 1.9, "t3p": 2.93, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=52 entity=1 copyright_filtered=0 generic_char_to_general=0 unknown_type=3"]}
6
+ {"id": 1023509, "n_gt": 13, "n_retrieved": 67, "n_selected": 64, "n_implied": 10, "n_structural": 5, "n_probe": 5, "ret_R": 0.3077, "P": 0.1406, "R": 0.6923, "F1": 0.2338, "leaf_P": 0.0435, "leaf_R": 0.3333, "leaf_F1": 0.0769, "n_leaf_sel": 46, "n_leaf_gt": 6, "ret_P": 0.0597, "sel_given_ret": 2.25, "over_sel": 4.92, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 47, "attempts_by_n_local": {"60": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}, "5": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 1, "char_F1": 0.0, "gen_P": 0.1429, "gen_R": 0.6923, "gen_F1": 0.2368, "missed": ["dialogue", "fur", "white_body", "white_fur"], "extra": ["2_panel_comic", "3_panel_comic", "4_panel_comic", "<3", "agamid", "air_bubble", "anthro", "bear", "capricorn", "carrying_over_shoulder", "cjk_character", "clothed", "clothing", "dark", "darkner", "darkness", "duo", "duo_focus", "empty_speech_bubble", "fan_character", "felid", "figurine", "frilled_lizard", "group", "human_only", "laying_on_ground", "light", "lying_on_ground", "male_human", "mask", "medical_instrument", "monitor_lizard", "not_furry", "note", "note_pad", "notebook", "on_ground", "oxygen_mask", "pear-shaped_figure", "question_mark", "sad", "scientific_instrument", "soap_bubbles", "speech_bubble", "standing", "standing_over", "stick_figure", "striped_body", "stripes", "sunlight", "taur", "thought_bubble", "trio", "waist", "wide_hips"], "ground_truth_tags": ["bovid", "caprine", "dialogue", "fur", "goat", "human", "lizard", "mammal", "reptile", "scalie", "text", 
"white_body", "white_fur"], "selected_tags": ["2_panel_comic", "3_panel_comic", "4_panel_comic", "<3", "agamid", "air_bubble", "anthro", "bear", "bovid", "capricorn", "caprine", "carrying_over_shoulder", "cjk_character", "clothed", "clothing", "dark", "darkner", "darkness", "duo", "duo_focus", "empty_speech_bubble", "fan_character", "felid", "figurine", "frilled_lizard", "goat", "group", "human", "human_only", "laying_on_ground", "light", "lizard", "lying_on_ground", "male_human", "mammal", "mask", "medical_instrument", "monitor_lizard", "not_furry", "note", "note_pad", "notebook", "on_ground", "oxygen_mask", "pear-shaped_figure", "question_mark", "reptile", "sad", "scalie", "scientific_instrument", "soap_bubbles", "speech_bubble", "standing", "standing_over", "stick_figure", "striped_body", "stripes", "sunlight", "taur", "text", "thought_bubble", "trio", "waist", "wide_hips"], "stage3_selected": ["2_panel_comic", "3_panel_comic", "4_panel_comic", "air_bubble", "capricorn", "caprine", "carrying_over_shoulder", "cjk_character", "dark", "darkner", "darkness", "duo", "duo_focus", "empty_speech_bubble", "fan_character", "figurine", "frilled_lizard", "goat", "group", "human", "human_only", "laying_on_ground", "light", "lizard", "lying_on_ground", "male_human", "mask", "monitor_lizard", "note", "note_pad", "notebook", "on_ground", "oxygen_mask", "pear-shaped_figure", "question_mark", "sad", "soap_bubbles", "speech_bubble", "standing", "standing_over", "stick_figure", "striped_body", "stripes", "sunlight", "thought_bubble", "trio", "waist"], "stage3_selected_scores": {"duo": 0.379, "group": 0.4732, "standing": 0.4714, "human": 0.5598, "speech_bubble": 0.5792, "stripes": 0.4622, "fan_character": 0.4163, "caprine": 0.47, "trio": 0.3761, "lizard": 0.5978, "striped_body": 0.3966, "goat": 0.5805, "mask": 0.3754, "light": 0.5849, "question_mark": 0.3121, "duo_focus": 0.3571, "on_ground": 0.4857, "thought_bubble": 0.475, "sad": 0.4012, "sunlight": 0.4787, "human_only": 0.4179, 
"dark": 0.4135, "darkner": 0.4149, "monitor_lizard": 0.4607, "pear-shaped_figure": 0.4006, "lying_on_ground": 0.5972, "notebook": 0.4057, "darkness": 0.6, "air_bubble": 0.4378, "note": 0.5684, "figurine": 0.5577, "laying_on_ground": 0.5581, "stick_figure": 0.4166, "soap_bubbles": 0.4325, "frilled_lizard": 0.4601, "standing_over": 0.5829, "oxygen_mask": 0.3762, "3_panel_comic": 0.4314, "carrying_over_shoulder": 0.409, "4_panel_comic": 0.4507, "waist": 0.7512, "2_panel_comic": 0.4346, "empty_speech_bubble": 0.5477, "capricorn": 0.5212, "note_pad": 0.4185, "male_human": 0.4242, "cjk_character": 0.4245}, "stage3_selected_ranks": {"duo": 53, "group": 21, "standing": 23, "human": 12, "speech_bubble": 9, "stripes": 25, "fan_character": 41, "caprine": 24, "trio": 56, "lizard": 3, "striped_body": 51, "goat": 8, "mask": 57, "light": 6, "question_mark": 67, "duo_focus": 62, "on_ground": 18, "thought_bubble": 20, "sad": 49, "sunlight": 19, "human_only": 39, "dark": 44, "darkner": 42, "monitor_lizard": 26, "pear-shaped_figure": 50, "lying_on_ground": 4, "notebook": 48, "darkness": 2, "air_bubble": 30, "note": 11, "figurine": 14, "laying_on_ground": 13, "stick_figure": 40, "soap_bubbles": 32, "frilled_lizard": 27, "standing_over": 7, "oxygen_mask": 55, "3_panel_comic": 33, "carrying_over_shoulder": 45, "4_panel_comic": 29, "waist": 1, "2_panel_comic": 31, "empty_speech_bubble": 15, "capricorn": 16, "note_pad": 38, "male_human": 36, "cjk_character": 34}, "stage3_selected_phrase_ranks": {"duo": 3, "group": 1, "standing": 2, "human": 1, "speech_bubble": 1, "stripes": 1, "fan_character": 2, "caprine": 3, "trio": 2, "lizard": 1, "striped_body": 3, "goat": 1, "mask": 3, "light": 1, "question_mark": 3, "duo_focus": 3, "on_ground": 3, "thought_bubble": 3, "sad": 3, "sunlight": 3, "human_only": 3, "dark": 3, "darkner": 2, "monitor_lizard": 2, "pear-shaped_figure": 3, "lying_on_ground": 1, "notebook": 3, "darkness": 1, "air_bubble": 2, "note": 1, "figurine": 2, "laying_on_ground": 2, 
"stick_figure": 3, "soap_bubbles": 3, "frilled_lizard": 3, "standing_over": 1, "oxygen_mask": 2, "3_panel_comic": 3, "carrying_over_shoulder": 3, "4_panel_comic": 1, "waist": 1, "2_panel_comic": 2, "empty_speech_bubble": 1, "capricorn": 2, "note_pad": 2, "male_human": 2, "cjk_character": 1}, "extra_evidence": {"2_panel_comic": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4346}, "3_panel_comic": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4314}, "4_panel_comic": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4507}, "<3": {"source": "probe"}, "agamid": {"source": "implied"}, "air_bubble": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4378}, "anthro": {"source": "structural"}, "bear": {"source": "probe"}, "capricorn": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5212}, "carrying_over_shoulder": {"source": "stage3", "why": "unknown", "retrieval_score": 0.409}, "cjk_character": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4245}, "clothed": {"source": "structural"}, "clothing": {"source": "implied"}, "dark": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4135}, "darkner": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4149}, "darkness": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6}, "duo": {"source": "stage3", "why": "unknown", "retrieval_score": 0.379}, "duo_focus": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3571}, "empty_speech_bubble": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5477}, "fan_character": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4163}, "felid": {"source": "probe"}, "figurine": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5577}, "frilled_lizard": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4601}, "group": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4732}, "human_only": {"source": "stage3", "why": "unknown", "retrieval_score": 
0.4179}, "laying_on_ground": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5581}, "light": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5849}, "lying_on_ground": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5972}, "male_human": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4242}, "mask": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3754}, "medical_instrument": {"source": "implied"}, "monitor_lizard": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4607}, "not_furry": {"source": "implied"}, "note": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5684}, "note_pad": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4185}, "notebook": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4057}, "on_ground": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4857}, "oxygen_mask": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3762}, "pear-shaped_figure": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4006}, "question_mark": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3121}, "sad": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4012}, "scientific_instrument": {"source": "implied"}, "soap_bubbles": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4325}, "speech_bubble": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5792}, "standing": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4714}, "standing_over": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5829}, "stick_figure": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4166}, "striped_body": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3966}, "stripes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4622}, "sunlight": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4787}, "taur": {"source": "structural"}, "thought_bubble": {"source": "stage3", 
"why": "unknown", "retrieval_score": 0.475}, "trio": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3761}, "waist": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7512}, "wide_hips": {"source": "implied"}}, "structural": ["group", "anthro", "taur", "clothed", "text"], "probe": ["group", "felid", "bear", "anthro", "<3"], "t1": 2.44, "t2": 1.95, "t3": 4.96, "t3s": 1.94, "t3p": 2.26, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=65 entity=0 copyright_filtered=2 generic_char_to_general=2 unknown_type=2"]}
7
+ {"id": 335343, "n_gt": 15, "n_retrieved": 67, "n_selected": 43, "n_implied": 6, "n_structural": 3, "n_probe": 6, "ret_R": 0.7333, "P": 0.2326, "R": 0.6667, "F1": 0.3448, "leaf_P": 0.2222, "leaf_R": 0.6667, "leaf_F1": 0.3333, "n_leaf_sel": 36, "n_leaf_gt": 12, "ret_P": 0.1642, "sel_given_ret": 0.9091, "over_sel": 2.87, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 31, "attempts_by_n_local": {"60": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}, "8": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.2326, "gen_R": 0.6667, "gen_F1": 0.3448, "missed": ["angry", "bed", "eyes_closed", "furniture", "sleeping"], "extra": ["3rd_party_watermark", "<3", "annoyed", "anthro", "bed_covers", "bedding", "bedroom", "big_eyes", "blush", "clothing", "color_swatch", "contest", "curtains_open", "dialogue", "distracting_watermark", "expressions", "felid", "green_ears", "highlights_(coloring)", "humanoid", "long_hair", "mammal", "pajamas", "portuguese_text", "purple_background", "purple_eyes", "purple_hands", "purple_highlights", "restricted_palette", "sleeping_together", "sleepover", "tired", "watermark"], "ground_truth_tags": ["angry", "bed", "blonde_hair", "blue_eyes", "duo", "eyes_closed", "eyeshadow", "furniture", "green_eyes", "hair", "lying", "makeup", "purple_hair", "sleeping", "text"], "selected_tags": ["3rd_party_watermark", "<3", "annoyed", "anthro", "bed_covers", "bedding", "bedroom", "big_eyes", "blonde_hair", "blue_eyes", "blush", "clothing", "color_swatch", "contest", 
"curtains_open", "dialogue", "distracting_watermark", "duo", "expressions", "eyeshadow", "felid", "green_ears", "green_eyes", "hair", "highlights_(coloring)", "humanoid", "long_hair", "lying", "makeup", "mammal", "pajamas", "portuguese_text", "purple_background", "purple_eyes", "purple_hair", "purple_hands", "purple_highlights", "restricted_palette", "sleeping_together", "sleepover", "text", "tired", "watermark"], "stage3_selected": ["3rd_party_watermark", "annoyed", "bed_covers", "bedroom", "big_eyes", "blonde_hair", "blue_eyes", "color_swatch", "contest", "curtains_open", "dialogue", "distracting_watermark", "expressions", "eyeshadow", "green_ears", "green_eyes", "long_hair", "lying", "makeup", "pajamas", "portuguese_text", "purple_background", "purple_eyes", "purple_hair", "purple_hands", "purple_highlights", "restricted_palette", "sleeping_together", "sleepover", "text", "tired"], "stage3_selected_scores": {"text": 0.6017, "blue_eyes": 0.6023, "dialogue": 0.4457, "lying": 0.4504, "green_eyes": 0.5999, "long_hair": 0.4595, "blonde_hair": 0.5995, "purple_eyes": 0.434, "purple_hair": 0.5647, "makeup": 0.5972, "eyeshadow": 0.4769, "bedroom": 0.491, "purple_background": 0.4971, "big_eyes": 0.4297, "annoyed": 0.5736, "restricted_palette": 0.4777, "tired": 0.5551, "color_swatch": 0.4623, "distracting_watermark": 0.5007, "pajamas": 0.3762, "green_ears": 0.4402, "purple_highlights": 0.4307, "bed_covers": 0.4156, "curtains_open": 0.4199, "expressions": 0.5449, "sleeping_together": 0.5093, "3rd_party_watermark": 0.3981, "contest": 0.3499, "sleepover": 0.5276, "purple_hands": 0.5398, "portuguese_text": 0.4433}, "stage3_selected_ranks": {"text": 8, "blue_eyes": 7, "dialogue": 44, "lying": 41, "green_eyes": 9, "long_hair": 38, "blonde_hair": 10, "purple_eyes": 49, "purple_hair": 14, "makeup": 11, "eyeshadow": 35, "bedroom": 31, "purple_background": 29, "big_eyes": 54, "annoyed": 13, "restricted_palette": 34, "tired": 16, "color_swatch": 37, "distracting_watermark": 28, 
"pajamas": 65, "green_ears": 47, "purple_highlights": 51, "bed_covers": 57, "curtains_open": 55, "expressions": 18, "sleeping_together": 24, "3rd_party_watermark": 59, "contest": 67, "sleepover": 23, "purple_hands": 19, "portuguese_text": 45}, "stage3_selected_phrase_ranks": {"text": 1, "blue_eyes": 1, "dialogue": 2, "lying": 1, "green_eyes": 1, "long_hair": 3, "blonde_hair": 1, "purple_eyes": 2, "purple_hair": 1, "makeup": 1, "eyeshadow": 3, "bedroom": 1, "purple_background": 3, "big_eyes": 3, "annoyed": 2, "restricted_palette": 2, "tired": 2, "color_swatch": 3, "distracting_watermark": 1, "pajamas": 3, "green_ears": 3, "purple_highlights": 3, "bed_covers": 3, "curtains_open": 2, "expressions": 3, "sleeping_together": 2, "3rd_party_watermark": 3, "contest": 2, "sleepover": 1, "purple_hands": 2, "portuguese_text": 3}, "extra_evidence": {"3rd_party_watermark": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3981}, "<3": {"source": "probe"}, "annoyed": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5736}, "anthro": {"source": "probe"}, "bed_covers": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4156}, "bedding": {"source": "implied"}, "bedroom": {"source": "stage3", "why": "unknown", "retrieval_score": 0.491}, "big_eyes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4297}, "blush": {"source": "probe"}, "clothing": {"source": "implied"}, "color_swatch": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4623}, "contest": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3499}, "curtains_open": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4199}, "dialogue": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4457}, "distracting_watermark": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5007}, "expressions": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5449}, "felid": {"source": "probe"}, "green_ears": {"source": "stage3", "why": "unknown", 
"retrieval_score": 0.4402}, "highlights_(coloring)": {"source": "implied"}, "humanoid": {"source": "structural"}, "long_hair": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4595}, "mammal": {"source": "implied"}, "pajamas": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3762}, "portuguese_text": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4433}, "purple_background": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4971}, "purple_eyes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.434}, "purple_hands": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5398}, "purple_highlights": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4307}, "restricted_palette": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4777}, "sleeping_together": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5093}, "sleepover": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5276}, "tired": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5551}, "watermark": {"source": "implied"}}, "structural": ["duo", "humanoid", "text"], "probe": ["simple_background", "felid", "duo", "blush", "anthro", "<3"], "t1": 2.63, "t2": 1.98, "t3": 7.61, "t3s": 0.68, "t3p": 2.34, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=68 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=3"]}
8
+ {"id": 2034167, "n_gt": 11, "n_retrieved": 68, "n_selected": 35, "n_implied": 6, "n_structural": 4, "n_probe": 5, "ret_R": 0.5455, "P": 0.2857, "R": 0.9091, "F1": 0.4348, "leaf_P": 0.2381, "leaf_R": 0.7143, "leaf_F1": 0.3571, "n_leaf_sel": 21, "n_leaf_gt": 7, "ret_P": 0.0882, "sel_given_ret": 1.6667, "over_sel": 3.18, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 26, "attempts_by_n_local": {"60": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}, "13": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.2857, "gen_R": 0.9091, "gen_F1": 0.4348, "missed": ["open_mouth"], "extra": ["animal_humanoid", "anthro", "blue_stripes", "canid_humanoid", "canine_humanoid", "clothed", "clothing", "curved_tail", "fluffy_fur", "fox_humanoid", "gradient_tail", "humanoid", "long_tail", "male", "mammal_humanoid", "midair", "pale_body", "pink_stripes", "riding", "skimpy", "slim_humanoid", "stripes", "tail", "tan_stripes", "white_nose"], "ground_truth_tags": ["blue_eyes", "blue_nose", "canid", "canine", "fur", "mammal", "open_mouth", "purple_body", "solo", "white_body", "white_fur"], "selected_tags": ["animal_humanoid", "anthro", "blue_eyes", "blue_nose", "blue_stripes", "canid", "canid_humanoid", "canine", "canine_humanoid", "clothed", "clothing", "curved_tail", "fluffy_fur", "fox_humanoid", "fur", "gradient_tail", "humanoid", "long_tail", "male", "mammal", "mammal_humanoid", "midair", "pale_body", "pink_stripes", "purple_body", "riding", "skimpy", "slim_humanoid", "solo", "stripes", "tail", 
"tan_stripes", "white_body", "white_fur", "white_nose"], "stage3_selected": ["blue_eyes", "blue_nose", "blue_stripes", "blurred_background", "canid_humanoid", "canine_humanoid", "curved_tail", "fluffy_fur", "fox_humanoid", "fur", "gradient_background", "gradient_tail", "long_tail", "midair", "pale_body", "pink_stripes", "purple_body", "riding", "simple_background", "skimpy", "slim_humanoid", "stripes", "tail", "tan_stripes", "white_fur", "white_nose"], "stage3_selected_scores": {"fur": 0.5962, "simple_background": 0.604, "tail": 0.6262, "blue_eyes": 0.6113, "white_fur": 0.6152, "stripes": 0.6216, "purple_body": 0.5754, "skimpy": 0.3825, "long_tail": 0.6362, "gradient_background": 0.5021, "canid_humanoid": 0.8514, "canine_humanoid": 0.8898, "blue_nose": 0.6093, "fox_humanoid": 0.81, "blurred_background": 0.5177, "riding": 0.3675, "blue_stripes": 0.6999, "midair": 0.4366, "white_nose": 0.5565, "pink_stripes": 0.7069, "fluffy_fur": 0.5831, "curved_tail": 0.7269, "gradient_tail": 0.5945, "pale_body": 0.4458, "slim_humanoid": 0.592, "tan_stripes": 0.6293}, "stage3_selected_ranks": {"fur": 29, "simple_background": 27, "tail": 16, "blue_eyes": 24, "white_fur": 21, "stripes": 18, "purple_body": 35, "skimpy": 69, "long_tail": 11, "gradient_background": 56, "canid_humanoid": 2, "canine_humanoid": 1, "blue_nose": 26, "fox_humanoid": 3, "blurred_background": 50, "riding": 72, "blue_stripes": 8, "midair": 67, "white_nose": 40, "pink_stripes": 7, "fluffy_fur": 34, "curved_tail": 6, "gradient_tail": 30, "pale_body": 66, "slim_humanoid": 31, "tan_stripes": 14}, "stage3_selected_phrase_ranks": {"fur": 1, "simple_background": 1, "tail": 1, "blue_eyes": 1, "white_fur": 2, "stripes": 2, "purple_body": 3, "skimpy": 2, "long_tail": 1, "gradient_background": 3, "canid_humanoid": 2, "canine_humanoid": 1, "blue_nose": 1, "fox_humanoid": 3, "blurred_background": 3, "riding": 2, "blue_stripes": 1, "midair": 2, "white_nose": 2, "pink_stripes": 1, "fluffy_fur": 2, "curved_tail": 1, 
"gradient_tail": 3, "pale_body": 3, "slim_humanoid": 3, "tan_stripes": 3}, "extra_evidence": {"animal_humanoid": {"source": "implied"}, "anthro": {"source": "structural"}, "blue_stripes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6999}, "canid_humanoid": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8514}, "canine_humanoid": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8898}, "clothed": {"source": "structural"}, "clothing": {"source": "probe"}, "curved_tail": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7269}, "fluffy_fur": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5831}, "fox_humanoid": {"source": "stage3", "why": "unknown", "retrieval_score": 0.81}, "gradient_tail": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5945}, "humanoid": {"source": "implied"}, "long_tail": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6362}, "male": {"source": "structural"}, "mammal_humanoid": {"source": "implied"}, "midair": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4366}, "pale_body": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4458}, "pink_stripes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7069}, "riding": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3675}, "skimpy": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3825}, "slim_humanoid": {"source": "stage3", "why": "unknown", "retrieval_score": 0.592}, "stripes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6216}, "tail": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6262}, "tan_stripes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6293}, "white_nose": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5565}}, "structural": ["solo", "anthro", "male", "clothed"], "probe": ["solo", "simple_background", "clothing", "canid", "anthro"], "t1": 1.11, "t2": 2.57, "t3": 4.82, "t3s": 1.11, "t3p": 2.24, 
"err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=73 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=4"]}
9
+ {"id": 1325009, "n_gt": 22, "n_retrieved": 56, "n_selected": 36, "n_implied": 13, "n_structural": 5, "n_probe": 5, "ret_R": 0.2727, "P": 0.3889, "R": 0.6364, "F1": 0.4828, "leaf_P": 0.1429, "leaf_R": 0.25, "leaf_F1": 0.1818, "n_leaf_sel": 21, "n_leaf_gt": 12, "ret_P": 0.1071, "sel_given_ret": 2.3333, "over_sel": 1.64, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 16, "attempts_by_n_local": {"57": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3889, "gen_R": 0.6364, "gen_F1": 0.4828, "missed": ["chest_tuft", "muscular", "muscular_anthro", "muscular_male", "pantherine", "tiger", "topless", "tuft"], "extra": ["bear", "belly", "countershade_belly", "countershade_body", "cross-hatching", "glistening", "glistening_body", "glistening_eyes", "glistening_fur", "hatching_(art)", "light_chest", "looking_at_viewer", "muscular_legs", "pattern_background", "shaded", "striped_back", "striped_body", "striped_fur", "tan_bottomwear", "tan_clothing", "tan_shorts", "white_chest"], "ground_truth_tags": ["anthro", "blue_eyes", "bottomwear", "chest_tuft", "clothed", "clothing", "countershading", "felid", "fur", "hand_on_head", "male", "mammal", "muscular", "muscular_anthro", "muscular_male", "pantherine", "shorts", "solo", "stripes", "tiger", "topless", "tuft"], "selected_tags": ["anthro", "bear", "belly", "blue_eyes", "bottomwear", "clothed", "clothing", "countershade_belly", "countershade_body", "countershading", "cross-hatching", "felid", "fur", "glistening", "glistening_body", "glistening_eyes", 
"glistening_fur", "hand_on_head", "hatching_(art)", "light_chest", "looking_at_viewer", "male", "mammal", "muscular_legs", "pattern_background", "shaded", "shorts", "solo", "striped_back", "striped_body", "striped_fur", "stripes", "tan_bottomwear", "tan_clothing", "tan_shorts", "white_chest"], "stage3_selected": ["blue_eyes", "countershade_belly", "countershade_body", "cross-hatching", "glistening_eyes", "glistening_fur", "hand_on_head", "light_chest", "muscular_legs", "pattern_background", "shorts", "striped_back", "striped_body", "striped_fur", "tan_shorts", "white_chest"], "stage3_selected_scores": {"blue_eyes": 0.5785, "shorts": 0.5914, "striped_body": 0.4159, "striped_fur": 0.6475, "hand_on_head": 0.6014, "glistening_eyes": 0.4769, "pattern_background": 0.5269, "glistening_fur": 0.501, "muscular_legs": 0.791, "white_chest": 0.917, "countershade_body": 0.8721, "striped_back": 0.7029, "countershade_belly": 0.828, "cross-hatching": 0.4762, "light_chest": 0.7491, "tan_shorts": 0.5498}, "stage3_selected_ranks": {"blue_eyes": 30, "shorts": 28, "striped_body": 55, "striped_fur": 16, "hand_on_head": 24, "glistening_eyes": 48, "pattern_background": 36, "glistening_fur": 41, "muscular_legs": 8, "white_chest": 2, "countershade_body": 3, "striped_back": 13, "countershade_belly": 6, "cross-hatching": 49, "light_chest": 11, "tan_shorts": 32}, "stage3_selected_phrase_ranks": {"blue_eyes": 2, "shorts": 1, "striped_body": 2, "striped_fur": 2, "hand_on_head": 2, "glistening_eyes": 3, "pattern_background": 1, "glistening_fur": 3, "muscular_legs": 2, "white_chest": 1, "countershade_body": 1, "striped_back": 2, "countershade_belly": 2, "cross-hatching": 3, "light_chest": 2, "tan_shorts": 2}, "extra_evidence": {"bear": {"source": "probe"}, "belly": {"source": "implied"}, "countershade_belly": {"source": "stage3", "why": "unknown", "retrieval_score": 0.828}, "countershade_body": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8721}, "cross-hatching": {"source": "stage3", 
"why": "unknown", "retrieval_score": 0.4762}, "glistening": {"source": "implied"}, "glistening_body": {"source": "implied"}, "glistening_eyes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4769}, "glistening_fur": {"source": "stage3", "why": "unknown", "retrieval_score": 0.501}, "hatching_(art)": {"source": "implied"}, "light_chest": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7491}, "looking_at_viewer": {"source": "structural"}, "muscular_legs": {"source": "stage3", "why": "unknown", "retrieval_score": 0.791}, "pattern_background": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5269}, "shaded": {"source": "implied"}, "striped_back": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7029}, "striped_body": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4159}, "striped_fur": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6475}, "tan_bottomwear": {"source": "implied"}, "tan_clothing": {"source": "implied"}, "tan_shorts": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5498}, "white_chest": {"source": "stage3", "why": "unknown", "retrieval_score": 0.917}}, "structural": ["solo", "anthro", "male", "clothed", "looking_at_viewer"], "probe": ["solo", "felid", "clothing", "bear", "anthro"], "t1": 1.9, "t2": 1.78, "t3": 2.98, "t3s": 1.08, "t3p": 1.38, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=57 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
10
+ {"id": 3285630, "n_gt": 12, "n_retrieved": 50, "n_selected": 47, "n_implied": 16, "n_structural": 4, "n_probe": 5, "ret_R": 0.25, "P": 0.2128, "R": 0.8333, "F1": 0.339, "leaf_P": 0.2083, "leaf_R": 0.5556, "leaf_F1": 0.303, "n_leaf_sel": 24, "n_leaf_gt": 9, "ret_P": 0.06, "sel_given_ret": 3.3333, "over_sel": 3.92, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 27, "attempts_by_n_local": {"53": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.2128, "gen_R": 0.8333, "gen_F1": 0.339, "missed": ["alpha_channel", "fingers"], "extra": ["beverage", "black_body", "black_fur", "black_necktie", "bottomwear", "brown_clothing", "brown_topwear", "brown_vest", "business_attire", "coffee_mug", "dress_shirt", "formal", "green_background", "grey_clothing", "grey_shirt", "grey_topwear", "hair_bun", "holding_beverage", "holding_mug", "holding_object", "mug", "necktie", "pants", "pockets", "serious", "shirt", "tan_bottomwear", "tan_clothing", "tan_pants", "teal_shirt", "teal_topwear", "text", "topwear", "vest", "white_body", "white_fur", "white_necktie"], "ground_truth_tags": ["alpha_channel", "anthro", "clothed", "clothing", "felid", "feline", "fingers", "fur", "hair", "male", "mammal", "solo"], "selected_tags": ["anthro", "beverage", "black_body", "black_fur", "black_necktie", "bottomwear", "brown_clothing", "brown_topwear", "brown_vest", "business_attire", "clothed", "clothing", "coffee_mug", "dress_shirt", "felid", "feline", "formal", "fur", "green_background", "grey_clothing", "grey_shirt", 
"grey_topwear", "hair", "hair_bun", "holding_beverage", "holding_mug", "holding_object", "male", "mammal", "mug", "necktie", "pants", "pockets", "serious", "shirt", "solo", "tan_bottomwear", "tan_clothing", "tan_pants", "teal_shirt", "teal_topwear", "text", "topwear", "vest", "white_body", "white_fur", "white_necktie"], "stage3_selected": ["black_fur", "black_necktie", "brown_vest", "business_attire", "coffee_mug", "dress_shirt", "felid", "feline", "formal", "fur", "green_background", "grey_shirt", "hair_bun", "holding_beverage", "holding_mug", "invalid_background", "mug", "necktie", "pockets", "serious", "shirt", "simple_background", "tan_pants", "teal_shirt", "vest", "white_fur", "white_necktie"], "stage3_selected_scores": {"fur": 0.7146, "simple_background": 0.6978, "felid": 0.6418, "white_fur": 0.5834, "feline": 0.7062, "shirt": 0.7998, "black_fur": 0.7183, "necktie": 0.7314, "vest": 0.8403, "green_background": 0.6069, "dress_shirt": 0.6132, "pockets": 0.6095, "hair_bun": 0.6926, "holding_beverage": 0.7721, "coffee_mug": 0.7055, "mug": 0.8841, "grey_shirt": 0.7582, "serious": 0.5823, "holding_mug": 0.916, "black_necktie": 0.7132, "tan_pants": 0.7373, "formal": 0.5993, "business_attire": 0.5657, "brown_vest": 0.8153, "teal_shirt": 0.7474, "white_necktie": 0.6418, "invalid_background": 0.6495}, "stage3_selected_ranks": {"fur": 20, "simple_background": 24, "felid": 27, "white_fur": 40, "feline": 22, "shirt": 5, "black_fur": 18, "necktie": 17, "vest": 3, "green_background": 37, "dress_shirt": 35, "pockets": 36, "hair_bun": 25, "holding_beverage": 8, "coffee_mug": 23, "mug": 2, "grey_shirt": 11, "serious": 41, "holding_mug": 1, "black_necktie": 21, "tan_pants": 16, "formal": 38, "business_attire": 43, "brown_vest": 4, "teal_shirt": 15, "white_necktie": 28, "invalid_background": 26}, "stage3_selected_phrase_ranks": {"fur": 1, "simple_background": 1, "felid": 2, "white_fur": 3, "feline": 1, "shirt": 1, "black_fur": 1, "necktie": 1, "vest": 1, "green_background": 2, 
"dress_shirt": 2, "pockets": 3, "hair_bun": 1, "holding_beverage": 3, "coffee_mug": 3, "mug": 1, "grey_shirt": 1, "serious": 2, "holding_mug": 1, "black_necktie": 2, "tan_pants": 3, "formal": 1, "business_attire": 1, "brown_vest": 1, "teal_shirt": 3, "white_necktie": 3, "invalid_background": 1}, "extra_evidence": {"beverage": {"source": "implied"}, "black_body": {"source": "implied"}, "black_fur": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7183}, "black_necktie": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7132}, "bottomwear": {"source": "implied"}, "brown_clothing": {"source": "implied"}, "brown_topwear": {"source": "implied"}, "brown_vest": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8153}, "business_attire": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5657}, "coffee_mug": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7055}, "dress_shirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6132}, "formal": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5993}, "green_background": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6069}, "grey_clothing": {"source": "implied"}, "grey_shirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7582}, "grey_topwear": {"source": "implied"}, "hair_bun": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6926}, "holding_beverage": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7721}, "holding_mug": {"source": "stage3", "why": "unknown", "retrieval_score": 0.916}, "holding_object": {"source": "implied"}, "mug": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8841}, "necktie": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7314}, "pants": {"source": "implied"}, "pockets": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6095}, "serious": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5823}, "shirt": {"source": "stage3", "why": "unknown", 
"retrieval_score": 0.7998}, "tan_bottomwear": {"source": "implied"}, "tan_clothing": {"source": "implied"}, "tan_pants": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7373}, "teal_shirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7474}, "teal_topwear": {"source": "implied"}, "text": {"source": "probe"}, "topwear": {"source": "implied"}, "vest": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8403}, "white_body": {"source": "implied"}, "white_fur": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5834}, "white_necktie": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6418}}, "structural": ["solo", "anthro", "male", "clothed"], "probe": ["text", "solo", "felid", "clothing", "anthro"], "t1": 2.15, "t2": 1.43, "t3": 4.3, "t3s": 0.88, "t3p": 1.95, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=53 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
11
+ {"id": 260449, "n_gt": 14, "n_retrieved": 57, "n_selected": 29, "n_implied": 5, "n_structural": 6, "n_probe": 5, "ret_R": 0.5714, "P": 0.3793, "R": 0.7857, "F1": 0.5116, "leaf_P": 0.2, "leaf_R": 0.4, "leaf_F1": 0.2667, "n_leaf_sel": 20, "n_leaf_gt": 10, "ret_P": 0.1404, "sel_given_ret": 1.375, "over_sel": 2.07, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 17, "attempts_by_n_local": {"60": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3793, "gen_R": 0.7857, "gen_F1": 0.5116, "missed": ["fur", "human", "male"], "extra": ["anthro", "body_hair", "bored_expression", "dancer_outfit", "duo", "feral", "flash", "grin", "leg_hair", "mischievous", "raised_arms", "red_hair", "smile", "smirk", "toony_expression", "topless", "trio", "wide_grin"], "ground_truth_tags": ["ape", "bear", "clothed", "clothing", "dancing", "fur", "group", "hair", "haplorhine", "human", "looking_at_viewer", "male", "mammal", "primate"], "selected_tags": ["anthro", "ape", "bear", "body_hair", "bored_expression", "clothed", "clothing", "dancer_outfit", "dancing", "duo", "feral", "flash", "grin", "group", "hair", "haplorhine", "leg_hair", "looking_at_viewer", "mammal", "mischievous", "primate", "raised_arms", "red_hair", "smile", "smirk", "toony_expression", "topless", "trio", "wide_grin"], "stage3_selected": ["ape", "bear", "bored_expression", "dancer_outfit", "dancing", "flash", "grin", "hair", "leg_hair", "mischievous", "primate", "raised_arms", "red_hair", "simple_background", "smirk", "toony_expression", "wide_grin"], 
"stage3_selected_scores": {"hair": 0.5485, "simple_background": 0.5541, "red_hair": 0.3689, "bear": 0.5757, "grin": 0.5711, "smirk": 0.3664, "primate": 0.8911, "dancing": 0.562, "ape": 0.9769, "raised_arms": 0.551, "leg_hair": 0.3824, "flash": 0.3227, "mischievous": 0.545, "bored_expression": 0.4389, "dancer_outfit": 0.4203, "toony_expression": 0.4737, "wide_grin": 0.5312}, "stage3_selected_ranks": {"hair": 14, "simple_background": 11, "red_hair": 52, "bear": 6, "grin": 8, "smirk": 53, "primate": 2, "dancing": 10, "ape": 1, "raised_arms": 13, "leg_hair": 48, "flash": 58, "mischievous": 15, "bored_expression": 34, "dancer_outfit": 38, "toony_expression": 26, "wide_grin": 17}, "stage3_selected_phrase_ranks": {"hair": 1, "simple_background": 1, "red_hair": 3, "bear": 1, "grin": 1, "smirk": 3, "primate": 1, "dancing": 1, "ape": 1, "raised_arms": 1, "leg_hair": 1, "flash": 2, "mischievous": 1, "bored_expression": 3, "dancer_outfit": 3, "toony_expression": 1, "wide_grin": 1}, "extra_evidence": {"anthro": {"source": "structural"}, "body_hair": {"source": "implied"}, "bored_expression": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4389}, "dancer_outfit": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4203}, "duo": {"source": "probe"}, "feral": {"source": "structural"}, "flash": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3227}, "grin": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5711}, "leg_hair": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3824}, "mischievous": {"source": "stage3", "why": "unknown", "retrieval_score": 0.545}, "raised_arms": {"source": "stage3", "why": "unknown", "retrieval_score": 0.551}, "red_hair": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3689}, "smile": {"source": "implied"}, "smirk": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3664}, "toony_expression": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4737}, "topless": {"source": 
"structural"}, "trio": {"source": "structural"}, "wide_grin": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5312}}, "structural": ["trio", "anthro", "feral", "clothed", "topless", "looking_at_viewer"], "probe": ["simple_background", "group", "duo", "bear", "anthro"], "t1": 3.55, "t2": 2.0, "t3": 6.31, "t3s": 1.45, "t3p": 1.45, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=60 entity=0 copyright_filtered=1 generic_char_to_general=1 unknown_type=2"]}
data/eval_results/k_sweep_explicit_no_why_seed42_k4.jsonl ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"_meta": true, "timestamp": "2026-03-03T05:59:49.506942", "n_samples": 10, "caption_field": "caption_cogvlm", "skip_rewrite": false, "allow_nsfw": false, "mode": "chunked_map_union", "chunk_size": 60, "eval_path": "data\\eval_samples\\e621_sfw_sample_1000_seed123_buffer10000_caption_evident.jsonl", "per_phrase_k": 2, "per_phrase_final_k": 4, "temperature": 0.0, "shuffle": true, "seed": 42, "workers": 1, "min_why": "strong_implied", "expand_implications": true, "infer_structural": true, "infer_probe": true, "n_errors": 0, "n_issue_samples": 10, "n_issues_total": 20}
2
+ {"id": 17482, "n_gt": 22, "n_retrieved": 52, "n_selected": 47, "n_implied": 20, "n_structural": 3, "n_probe": 4, "ret_R": 0.2727, "P": 0.3617, "R": 0.7727, "F1": 0.4928, "leaf_P": 0.2174, "leaf_R": 0.3846, "leaf_F1": 0.2778, "n_leaf_sel": 23, "n_leaf_gt": 13, "ret_P": 0.1154, "sel_given_ret": 2.8333, "over_sel": 2.14, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 22, "attempts_by_n_local": {"55": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3617, "gen_R": 0.7727, "gen_F1": 0.4928, "missed": ["fingers", "fur", "holding_musical_instrument", "holding_object", "music"], "extra": ["4_claws", "5_claws", "<3", "arctic_wolf", "black_hair", "bottomwear", "campfire", "canis", "demon", "denim", "denim_clothing", "determined", "electric_guitar", "flowing_hair", "jeans", "membrane_(anatomy)", "membranous_wings", "notched_ear", "pants", "pastel_background", "playing_guitar", "playing_music", "succubus", "t-pose", "tire", "torn_bottomwear", "torn_jeans", "torn_pants", "wings", "wolf"], "ground_truth_tags": ["anthro", "bass_guitar", "canid", "canine", "claws", "clothed", "clothing", "fingers", "fur", "guitar", "hair", "holding_musical_instrument", "holding_object", "mammal", "music", "musical_instrument", "plucked_string_instrument", "solo", "spade_tail", "string_instrument", "tail", "torn_clothing"], "selected_tags": ["4_claws", "5_claws", "<3", "anthro", "arctic_wolf", "bass_guitar", "black_hair", "bottomwear", "campfire", "canid", "canine", "canis", "claws", "clothed", "clothing", "demon", 
"denim", "denim_clothing", "determined", "electric_guitar", "flowing_hair", "guitar", "hair", "jeans", "mammal", "membrane_(anatomy)", "membranous_wings", "musical_instrument", "notched_ear", "pants", "pastel_background", "playing_guitar", "playing_music", "plucked_string_instrument", "solo", "spade_tail", "string_instrument", "succubus", "t-pose", "tail", "tire", "torn_bottomwear", "torn_clothing", "torn_jeans", "torn_pants", "wings", "wolf"], "stage3_selected": ["4_claws", "5_claws", "arctic_wolf", "bass_guitar", "black_hair", "campfire", "claws", "demon", "determined", "electric_guitar", "flowing_hair", "guitar", "membranous_wings", "notched_ear", "pastel_background", "playing_guitar", "spade_tail", "succubus", "t-pose", "tire", "torn_bottomwear", "torn_jeans"], "stage3_selected_scores": {"claws": 0.5684, "black_hair": 0.3899, "membranous_wings": 0.4106, "demon": 0.4008, "spade_tail": 0.618, "notched_ear": 0.4315, "torn_bottomwear": 0.4362, "guitar": 0.9623, "succubus": 0.3867, "campfire": 0.4496, "arctic_wolf": 0.4908, "playing_guitar": 0.9317, "torn_jeans": 0.4824, "tire": 0.4151, "electric_guitar": 0.8664, "bass_guitar": 0.9118, "flowing_hair": 0.5669, "4_claws": 0.4516, "determined": 0.4471, "t-pose": 0.5519, "5_claws": 0.4601, "pastel_background": 0.5632}, "stage3_selected_ranks": {"claws": 11, "black_hair": 54, "membranous_wings": 49, "demon": 51, "spade_tail": 6, "notched_ear": 41, "torn_bottomwear": 39, "guitar": 1, "succubus": 55, "campfire": 36, "arctic_wolf": 29, "playing_guitar": 2, "torn_jeans": 31, "tire": 48, "electric_guitar": 5, "bass_guitar": 3, "flowing_hair": 12, "4_claws": 35, "determined": 38, "t-pose": 16, "5_claws": 33, "pastel_background": 14}, "stage3_selected_phrase_ranks": {"claws": 1, "black_hair": 4, "membranous_wings": 2, "demon": 3, "spade_tail": 1, "notched_ear": 4, "torn_bottomwear": 3, "guitar": 1, "succubus": 4, "campfire": 2, "arctic_wolf": 2, "playing_guitar": 1, "torn_jeans": 1, "tire": 4, "electric_guitar": 4, 
"bass_guitar": 2, "flowing_hair": 1, "4_claws": 4, "determined": 3, "t-pose": 2, "5_claws": 3, "pastel_background": 1}, "extra_evidence": {"4_claws": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4516}, "5_claws": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4601}, "<3": {"source": "probe"}, "arctic_wolf": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4908}, "black_hair": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3899}, "bottomwear": {"source": "implied"}, "campfire": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4496}, "canis": {"source": "implied"}, "demon": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4008}, "denim": {"source": "implied"}, "denim_clothing": {"source": "implied"}, "determined": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4471}, "electric_guitar": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8664}, "flowing_hair": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5669}, "jeans": {"source": "implied"}, "membrane_(anatomy)": {"source": "implied"}, "membranous_wings": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4106}, "notched_ear": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4315}, "pants": {"source": "implied"}, "pastel_background": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5632}, "playing_guitar": {"source": "stage3", "why": "unknown", "retrieval_score": 0.9317}, "playing_music": {"source": "implied"}, "succubus": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3867}, "t-pose": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5519}, "tire": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4151}, "torn_bottomwear": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4362}, "torn_jeans": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4824}, "torn_pants": {"source": "implied"}, "wings": {"source": "implied"}, "wolf": 
{"source": "implied"}}, "structural": ["solo", "anthro", "clothed"], "probe": ["solo", "canid", "anthro", "<3"], "t1": 1.9, "t2": 2.52, "t3": 5.63, "t3s": 3.71, "t3p": 3.22, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=55 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
3
+ {"id": 1624724, "n_gt": 4, "n_retrieved": 53, "n_selected": 31, "n_implied": 4, "n_structural": 4, "n_probe": 3, "ret_R": 0.75, "P": 0.0968, "R": 0.75, "F1": 0.1714, "leaf_P": 0.1111, "leaf_R": 0.75, "leaf_F1": 0.1935, "n_leaf_sel": 27, "n_leaf_gt": 4, "ret_P": 0.0566, "sel_given_ret": 1.0, "over_sel": 7.75, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 23, "attempts_by_n_local": {"54": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.0968, "gen_R": 0.75, "gen_F1": 0.1714, "missed": ["smile"], "extra": ["ambiguous_gender", "bear", "black_eyelids", "black_inner_ear", "covering", "covering_crotch", "covering_face", "eye_spots", "feral", "floating_hands", "floating_head", "full-length_portrait", "glistening", "glistening_eyes", "jagged_mouth", "light_nose", "looking_away", "mammal", "mouth_closed", "no_irises", "nude", "portrait", "round_eyes", "round_nose", "spots", "spotted_back", "toony", "yellow_background"], "ground_truth_tags": ["red_nose", "smile", "solo", "tan_body"], "selected_tags": ["ambiguous_gender", "bear", "black_eyelids", "black_inner_ear", "covering", "covering_crotch", "covering_face", "eye_spots", "feral", "floating_hands", "floating_head", "full-length_portrait", "glistening", "glistening_eyes", "jagged_mouth", "light_nose", "looking_away", "mammal", "mouth_closed", "no_irises", "nude", "portrait", "red_nose", "round_eyes", "round_nose", "solo", "spots", "spotted_back", "tan_body", "toony", "yellow_background"], "stage3_selected": ["black_eyelids", "black_inner_ear", 
"covering_crotch", "covering_face", "eye_spots", "floating_hands", "floating_head", "full-length_portrait", "glistening_eyes", "jagged_mouth", "light_nose", "looking_away", "mouth_closed", "no_irises", "red_nose", "round_eyes", "round_nose", "spots", "spotted_back", "tan_body", "toony", "white_background", "yellow_background"], "stage3_selected_scores": {"white_background": 0.6356, "tan_body": 0.6834, "spots": 0.6374, "full-length_portrait": 0.4759, "toony": 0.6426, "looking_away": 0.5307, "glistening_eyes": 0.5244, "mouth_closed": 0.662, "red_nose": 0.7489, "yellow_background": 0.5688, "covering_crotch": 0.4463, "black_inner_ear": 0.6379, "covering_face": 0.4644, "floating_hands": 0.4635, "no_irises": 0.5909, "floating_head": 0.5049, "jagged_mouth": 0.5874, "light_nose": 0.6896, "round_eyes": 0.8869, "black_eyelids": 0.6551, "eye_spots": 0.7021, "spotted_back": 0.7237, "round_nose": 0.5839}, "stage3_selected_ranks": {"white_background": 27, "tan_body": 13, "spots": 26, "full-length_portrait": 51, "toony": 24, "looking_away": 42, "glistening_eyes": 43, "mouth_closed": 19, "red_nose": 4, "yellow_background": 37, "covering_crotch": 56, "black_inner_ear": 25, "covering_face": 54, "floating_hands": 55, "no_irises": 33, "floating_head": 47, "jagged_mouth": 34, "light_nose": 12, "round_eyes": 2, "black_eyelids": 21, "eye_spots": 11, "spotted_back": 7, "round_nose": 35}, "stage3_selected_phrase_ranks": {"white_background": 1, "tan_body": 4, "spots": 4, "full-length_portrait": 4, "toony": 1, "looking_away": 4, "glistening_eyes": 4, "mouth_closed": 2, "red_nose": 1, "yellow_background": 2, "covering_crotch": 4, "black_inner_ear": 4, "covering_face": 3, "floating_hands": 4, "no_irises": 3, "floating_head": 2, "jagged_mouth": 4, "light_nose": 2, "round_eyes": 1, "black_eyelids": 4, "eye_spots": 3, "spotted_back": 2, "round_nose": 4}, "extra_evidence": {"ambiguous_gender": {"source": "structural"}, "bear": {"source": "probe"}, "black_eyelids": {"source": "stage3", "why": 
"unknown", "retrieval_score": 0.6551}, "black_inner_ear": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6379}, "covering": {"source": "implied"}, "covering_crotch": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4463}, "covering_face": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4644}, "eye_spots": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7021}, "feral": {"source": "structural"}, "floating_hands": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4635}, "floating_head": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5049}, "full-length_portrait": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4759}, "glistening": {"source": "implied"}, "glistening_eyes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5244}, "jagged_mouth": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5874}, "light_nose": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6896}, "looking_away": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5307}, "mammal": {"source": "implied"}, "mouth_closed": {"source": "stage3", "why": "unknown", "retrieval_score": 0.662}, "no_irises": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5909}, "nude": {"source": "structural"}, "portrait": {"source": "implied"}, "round_eyes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8869}, "round_nose": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5839}, "spots": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6374}, "spotted_back": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7237}, "toony": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6426}, "yellow_background": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5688}}, "structural": ["solo", "feral", "ambiguous_gender", "nude"], "probe": ["solo", "simple_background", "bear"], "t1": 2.73, "t2": 1.16, "t3": 1.28, "t3s": 0.66, "t3p": 1.43, 
"err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=54 entity=0 copyright_filtered=2 generic_char_to_general=0 unknown_type=5"]}
4
+ {"id": 1078019, "n_gt": 14, "n_retrieved": 55, "n_selected": 28, "n_implied": 11, "n_structural": 4, "n_probe": 4, "ret_R": 0.7143, "P": 0.4286, "R": 0.8571, "F1": 0.5714, "leaf_P": 0.4667, "leaf_R": 0.7778, "leaf_F1": 0.5833, "n_leaf_sel": 15, "n_leaf_gt": 9, "ret_P": 0.1818, "sel_given_ret": 1.2, "over_sel": 2.0, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 12, "attempts_by_n_local": {"55": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.4286, "gen_R": 0.8571, "gen_F1": 0.5714, "missed": ["romantic", "romantic_couple"], "extra": ["<3", "coat", "diaper", "heterochromia", "holding_object", "holding_plushie", "looking_at_viewer", "pull-ups_(diaper)", "red_clothing", "red_coat", "red_topwear", "topwear", "white_clothing", "white_coat", "white_topwear", "wide_eyed"], "ground_truth_tags": ["anthro", "blue_eyes", "blush", "clothed", "clothing", "duo", "lagomorph", "leporid", "mammal", "plushie", "rabbit", "romantic", "romantic_couple", "teal_eyes"], "selected_tags": ["<3", "anthro", "blue_eyes", "blush", "clothed", "clothing", "coat", "diaper", "duo", "heterochromia", "holding_object", "holding_plushie", "lagomorph", "leporid", "looking_at_viewer", "mammal", "plushie", "pull-ups_(diaper)", "rabbit", "red_clothing", "red_coat", "red_topwear", "teal_eyes", "topwear", "white_clothing", "white_coat", "white_topwear", "wide_eyed"], "stage3_selected": ["blue_eyes", "blush", "coat", "heterochromia", "holding_plushie", "leporid", "pull-ups_(diaper)", "rabbit", "red_coat", "teal_eyes", "white_coat", 
"wide_eyed"], "stage3_selected_scores": {"blush": 0.6084, "blue_eyes": 0.6154, "leporid": 0.5313, "rabbit": 0.5941, "heterochromia": 0.4304, "coat": 0.6386, "wide_eyed": 0.4619, "teal_eyes": 0.6285, "holding_plushie": 0.7794, "white_coat": 0.5255, "pull-ups_(diaper)": 0.5206, "red_coat": 0.5209}, "stage3_selected_ranks": {"blush": 13, "blue_eyes": 12, "leporid": 26, "rabbit": 14, "heterochromia": 51, "coat": 7, "wide_eyed": 41, "teal_eyes": 8, "holding_plushie": 2, "white_coat": 29, "pull-ups_(diaper)": 31, "red_coat": 30}, "stage3_selected_phrase_ranks": {"blush": 1, "blue_eyes": 1, "leporid": 3, "rabbit": 1, "heterochromia": 4, "coat": 1, "wide_eyed": 4, "teal_eyes": 1, "holding_plushie": 1, "white_coat": 3, "pull-ups_(diaper)": 2, "red_coat": 4}, "extra_evidence": {"<3": {"source": "probe"}, "coat": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6386}, "diaper": {"source": "implied"}, "heterochromia": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4304}, "holding_object": {"source": "implied"}, "holding_plushie": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7794}, "looking_at_viewer": {"source": "structural"}, "pull-ups_(diaper)": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5206}, "red_clothing": {"source": "implied"}, "red_coat": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5209}, "red_topwear": {"source": "implied"}, "topwear": {"source": "implied"}, "white_clothing": {"source": "implied"}, "white_coat": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5255}, "white_topwear": {"source": "implied"}, "wide_eyed": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4619}}, "structural": ["duo", "anthro", "clothed", "looking_at_viewer"], "probe": ["duo", "blush", "anthro", "<3"], "t1": 1.65, "t2": 1.19, "t3": 4.13, "t3s": 1.49, "t3p": 1.11, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 
split: general=55 entity=0 copyright_filtered=1 generic_char_to_general=0 unknown_type=2"]}
5
+ {"id": 2021552, "n_gt": 25, "n_retrieved": 62, "n_selected": 72, "n_implied": 25, "n_structural": 4, "n_probe": 7, "ret_R": 0.56, "P": 0.3194, "R": 0.92, "F1": 0.4742, "leaf_P": 0.1944, "leaf_R": 0.4667, "leaf_F1": 0.2745, "n_leaf_sel": 36, "n_leaf_gt": 15, "ret_P": 0.2258, "sel_given_ret": 1.6429, "over_sel": 2.88, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 3, "dupe_indices_total": 0, "kept_total": 39, "attempts_by_n_local": {"60": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}, "3": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3194, "gen_R": 0.92, "gen_F1": 0.4742, "missed": ["looking_at_another", "standing"], "extra": ["4_claws", "arms_out", "black_bottomwear", "black_clothing", "black_pants", "blue_clothing", "blue_overalls", "blue_shirt", "blue_topwear", "blush", "brown_clothing", "brown_shirt", "brown_topwear", "buckteeth", "cheek_markings", "cross_fox", "eye_markings", "felid", "gloves_(marking)", "grey_bottomwear", "grey_clothing", "grey_pants", "grey_shirt", "grey_topwear", "hand_in_pocket", "long_arms", "looking_at_viewer", "marble_fox", "one_eye_half-closed", "open_mouth", "open_smile", "pockets", "rabbit_ears", "red_fox", "smile", "snout", "snout_markings", "tail", "tail_markings", "tan_body", "tan_fur", "teeth", "undershirt", "white_body", "white_clothing", "white_fur", "white_shirt", "white_topwear", "wide_eyed"], "ground_truth_tags": ["anthro", "bottomwear", "canid", "canine", "claws", "clothed", "clothing", "crossed_arms", "duo", "facial_markings", "fox", "fur", "grey_background", 
"head_markings", "lagomorph", "leporid", "looking_at_another", "mammal", "markings", "overalls", "pants", "rabbit", "shirt", "standing", "topwear"], "selected_tags": ["4_claws", "anthro", "arms_out", "black_bottomwear", "black_clothing", "black_pants", "blue_clothing", "blue_overalls", "blue_shirt", "blue_topwear", "blush", "bottomwear", "brown_clothing", "brown_shirt", "brown_topwear", "buckteeth", "canid", "canine", "cheek_markings", "claws", "clothed", "clothing", "cross_fox", "crossed_arms", "duo", "eye_markings", "facial_markings", "felid", "fox", "fur", "gloves_(marking)", "grey_background", "grey_bottomwear", "grey_clothing", "grey_pants", "grey_shirt", "grey_topwear", "hand_in_pocket", "head_markings", "lagomorph", "leporid", "long_arms", "looking_at_viewer", "mammal", "marble_fox", "markings", "one_eye_half-closed", "open_mouth", "open_smile", "overalls", "pants", "pockets", "rabbit", "rabbit_ears", "red_fox", "shirt", "smile", "snout", "snout_markings", "tail", "tail_markings", "tan_body", "tan_fur", "teeth", "topwear", "undershirt", "white_body", "white_clothing", "white_fur", "white_shirt", "white_topwear", "wide_eyed"], "stage3_selected": ["4_claws", "arms_out", "black_pants", "blue_overalls", "blue_shirt", "brown_shirt", "buckteeth", "cheek_markings", "claws", "cross_fox", "crossed_arms", "eye_markings", "facial_markings", "fox", "fur", "gloves_(marking)", "grey_background", "grey_pants", "grey_shirt", "hand_in_pocket", "lagomorph", "leporid", "long_arms", "marble_fox", "one_eye_half-closed", "open_mouth", "open_smile", "overalls", "rabbit", "rabbit_ears", "shirt", "snout_markings", "tail_markings", "tan_fur", "undershirt", "white_fur", "white_shirt", "white_topwear", "wide_eyed"], "stage3_selected_scores": {"fur": 0.6548, "open_mouth": 0.6344, "claws": 0.6317, "white_fur": 0.5166, "fox": 0.6393, "shirt": 0.7497, "lagomorph": 0.5947, "leporid": 0.5837, "rabbit": 0.6521, "tan_fur": 0.5207, "tail_markings": 0.6221, "open_smile": 0.5285, 
"grey_background": 0.6797, "gloves_(marking)": 0.6278, "buckteeth": 0.5324, "facial_markings": 0.6956, "crossed_arms": 0.7298, "wide_eyed": 0.4682, "white_topwear": 0.768, "white_shirt": 0.8206, "overalls": 0.8782, "black_pants": 0.8338, "blue_shirt": 0.7663, "hand_in_pocket": 0.5675, "eye_markings": 0.637, "snout_markings": 0.6224, "grey_shirt": 0.693, "grey_pants": 0.7578, "undershirt": 0.7074, "rabbit_ears": 0.6003, "cross_fox": 0.4701, "one_eye_half-closed": 0.4534, "brown_shirt": 0.7778, "cheek_markings": 0.6222, "4_claws": 0.5925, "long_arms": 0.5862, "marble_fox": 0.5584, "arms_out": 0.5673, "blue_overalls": 0.9205}, "stage3_selected_ranks": {"fur": 22, "open_mouth": 26, "claws": 28, "white_fur": 60, "fox": 24, "shirt": 11, "lagomorph": 39, "leporid": 43, "rabbit": 23, "tan_fur": 59, "tail_markings": 32, "open_smile": 58, "grey_background": 19, "gloves_(marking)": 29, "buckteeth": 56, "facial_markings": 16, "crossed_arms": 13, "wide_eyed": 63, "white_topwear": 6, "white_shirt": 4, "overalls": 2, "black_pants": 3, "blue_shirt": 7, "hand_in_pocket": 47, "eye_markings": 25, "snout_markings": 30, "grey_shirt": 17, "grey_pants": 9, "undershirt": 15, "rabbit_ears": 36, "cross_fox": 62, "one_eye_half-closed": 64, "brown_shirt": 5, "cheek_markings": 31, "4_claws": 40, "long_arms": 42, "marble_fox": 50, "arms_out": 48, "blue_overalls": 1}, "stage3_selected_phrase_ranks": {"fur": 1, "open_mouth": 1, "claws": 1, "white_fur": 4, "fox": 1, "shirt": 1, "lagomorph": 3, "leporid": 4, "rabbit": 1, "tan_fur": 3, "tail_markings": 4, "open_smile": 2, "grey_background": 1, "gloves_(marking)": 3, "buckteeth": 3, "facial_markings": 1, "crossed_arms": 1, "wide_eyed": 3, "white_topwear": 3, "white_shirt": 1, "overalls": 1, "black_pants": 1, "blue_shirt": 3, "hand_in_pocket": 3, "eye_markings": 2, "snout_markings": 3, "grey_shirt": 4, "grey_pants": 2, "undershirt": 3, "rabbit_ears": 1, "cross_fox": 4, "one_eye_half-closed": 4, "brown_shirt": 2, "cheek_markings": 4, "4_claws": 3, 
"long_arms": 2, "marble_fox": 2, "arms_out": 3, "blue_overalls": 1}, "extra_evidence": {"4_claws": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5925}, "arms_out": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5673}, "black_bottomwear": {"source": "implied"}, "black_clothing": {"source": "implied"}, "black_pants": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8338}, "blue_clothing": {"source": "implied"}, "blue_overalls": {"source": "stage3", "why": "unknown", "retrieval_score": 0.9205}, "blue_shirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7663}, "blue_topwear": {"source": "implied"}, "blush": {"source": "probe"}, "brown_clothing": {"source": "implied"}, "brown_shirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7778}, "brown_topwear": {"source": "implied"}, "buckteeth": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5324}, "cheek_markings": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6222}, "cross_fox": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4701}, "eye_markings": {"source": "stage3", "why": "unknown", "retrieval_score": 0.637}, "felid": {"source": "probe"}, "gloves_(marking)": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6278}, "grey_bottomwear": {"source": "implied"}, "grey_clothing": {"source": "implied"}, "grey_pants": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7578}, "grey_shirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.693}, "grey_topwear": {"source": "implied"}, "hand_in_pocket": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5675}, "long_arms": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5862}, "looking_at_viewer": {"source": "structural"}, "marble_fox": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5584}, "one_eye_half-closed": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4534}, "open_mouth": {"source": "stage3", "why": 
"unknown", "retrieval_score": 0.6344}, "open_smile": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5285}, "pockets": {"source": "implied"}, "rabbit_ears": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6003}, "red_fox": {"source": "implied"}, "smile": {"source": "implied"}, "snout": {"source": "implied"}, "snout_markings": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6224}, "tail": {"source": "implied"}, "tail_markings": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6221}, "tan_body": {"source": "implied"}, "tan_fur": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5207}, "teeth": {"source": "implied"}, "undershirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7074}, "white_body": {"source": "implied"}, "white_clothing": {"source": "implied"}, "white_fur": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5166}, "white_shirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8206}, "white_topwear": {"source": "stage3", "why": "unknown", "retrieval_score": 0.768}, "wide_eyed": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4682}}, "structural": ["duo", "anthro", "clothed", "looking_at_viewer"], "probe": ["simple_background", "felid", "duo", "clothing", "canid", "blush", "anthro"], "t1": 1.85, "t2": 1.4, "t3": 11.42, "t3s": 1.32, "t3p": 2.01, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=63 entity=1 copyright_filtered=0 generic_char_to_general=0 unknown_type=3"]}
6
+ {"id": 1023509, "n_gt": 13, "n_retrieved": 77, "n_selected": 65, "n_implied": 10, "n_structural": 5, "n_probe": 5, "ret_R": 0.4615, "P": 0.1538, "R": 0.7692, "F1": 0.2564, "leaf_P": 0.0408, "leaf_R": 0.3333, "leaf_F1": 0.0727, "n_leaf_sel": 49, "n_leaf_gt": 6, "ret_P": 0.0779, "sel_given_ret": 1.6667, "over_sel": 5.0, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 46, "attempts_by_n_local": {"60": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}, "14": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.1538, "gen_R": 0.7692, "gen_F1": 0.2564, "missed": ["fur", "white_body", "white_fur"], "extra": ["2_panel_comic", "3_panel_comic", "<3", "agamid", "anthro", "bear", "black_speech_bubble", "bodily_fluids", "border", "bubble", "clothed", "clothing", "comic_panel", "dark_theme", "darkness", "defeated", "domestic_goat", "duo", "evil_look", "face_mask", "felid", "flask", "frilled_lizard", "gecko", "goo_creature", "group", "human_only", "hunched_over", "iguanid", "light", "lying_on_ground", "male_human", "medieval", "medieval_fantasy", "noseless", "not_furry", "note", "note_pad", "on_ground", "patchwork_creature", "rubble", "snot", "snot_bubble", "solo", "speech_bubble", "standing_over", "striped_body", "stripes", "text_message", "thought_bubble", "threatening", "topwear", "torch", "unknown_species", "vest"], "ground_truth_tags": ["bovid", "caprine", "dialogue", "fur", "goat", "human", "lizard", "mammal", "reptile", "scalie", "text", "white_body", "white_fur"], "selected_tags": 
["2_panel_comic", "3_panel_comic", "<3", "agamid", "anthro", "bear", "black_speech_bubble", "bodily_fluids", "border", "bovid", "bubble", "caprine", "clothed", "clothing", "comic_panel", "dark_theme", "darkness", "defeated", "dialogue", "domestic_goat", "duo", "evil_look", "face_mask", "felid", "flask", "frilled_lizard", "gecko", "goat", "goo_creature", "group", "human", "human_only", "hunched_over", "iguanid", "light", "lizard", "lying_on_ground", "male_human", "mammal", "medieval", "medieval_fantasy", "noseless", "not_furry", "note", "note_pad", "on_ground", "patchwork_creature", "reptile", "rubble", "scalie", "snot", "snot_bubble", "solo", "speech_bubble", "standing_over", "striped_body", "stripes", "text", "text_message", "thought_bubble", "threatening", "topwear", "torch", "unknown_species", "vest"], "stage3_selected": ["2_panel_comic", "3_panel_comic", "black_speech_bubble", "border", "bovid", "bubble", "caprine", "comic_panel", "dark_theme", "darkness", "defeated", "dialogue", "domestic_goat", "evil_look", "face_mask", "flask", "frilled_lizard", "gecko", "goat", "goo_creature", "human", "human_only", "hunched_over", "iguanid", "light", "lizard", "lying_on_ground", "male_human", "medieval", "medieval_fantasy", "noseless", "note", "note_pad", "on_ground", "patchwork_creature", "rubble", "snot_bubble", "speech_bubble", "standing_over", "striped_body", "text_message", "thought_bubble", "threatening", "torch", "unknown_species", "vest"], "stage3_selected_scores": {"dialogue": 0.6426, "human": 0.669, "speech_bubble": 0.7584, "bovid": 0.6057, "caprine": 0.638, "border": 0.5087, "lizard": 0.839, "striped_body": 0.5492, "goat": 0.7768, "light": 0.7793, "unknown_species": 0.7697, "on_ground": 0.674, "goo_creature": 0.5154, "vest": 0.502, "thought_bubble": 0.6581, "bubble": 0.7508, "human_only": 0.5271, "noseless": 0.4577, "gecko": 0.6408, "torch": 0.5677, "defeated": 0.6174, "lying_on_ground": 0.7947, "face_mask": 0.5493, "iguanid": 0.6016, "medieval": 0.5307, 
"comic_panel": 0.6176, "threatening": 0.5625, "darkness": 0.8329, "dark_theme": 0.5945, "note": 0.7399, "domestic_goat": 0.604, "text_message": 0.5644, "flask": 0.5338, "rubble": 0.6096, "black_speech_bubble": 0.6325, "patchwork_creature": 0.6123, "hunched_over": 0.5729, "frilled_lizard": 0.675, "standing_over": 0.7647, "3_panel_comic": 0.6265, "snot_bubble": 0.612, "evil_look": 0.5665, "2_panel_comic": 0.6184, "note_pad": 0.558, "medieval_fantasy": 0.5207, "male_human": 0.5565}, "stage3_selected_ranks": {"dialogue": 22, "human": 18, "speech_bubble": 8, "bovid": 36, "caprine": 24, "border": 71, "lizard": 1, "striped_body": 60, "goat": 5, "light": 4, "unknown_species": 6, "on_ground": 16, "goo_creature": 69, "vest": 74, "thought_bubble": 19, "bubble": 9, "human_only": 65, "noseless": 76, "gecko": 23, "torch": 48, "defeated": 30, "lying_on_ground": 3, "face_mask": 59, "iguanid": 38, "medieval": 64, "comic_panel": 29, "threatening": 53, "darkness": 2, "dark_theme": 41, "note": 10, "domestic_goat": 37, "text_message": 52, "flask": 62, "rubble": 35, "black_speech_bubble": 25, "patchwork_creature": 32, "hunched_over": 46, "frilled_lizard": 15, "standing_over": 7, "3_panel_comic": 27, "snot_bubble": 33, "evil_look": 49, "2_panel_comic": 28, "note_pad": 56, "medieval_fantasy": 67, "male_human": 57}, "stage3_selected_phrase_ranks": {"dialogue": 3, "human": 1, "speech_bubble": 1, "bovid": 3, "caprine": 2, "border": 3, "lizard": 1, "striped_body": 4, "goat": 1, "light": 1, "unknown_species": 1, "on_ground": 3, "goo_creature": 3, "vest": 3, "thought_bubble": 2, "bubble": 1, "human_only": 3, "noseless": 4, "gecko": 3, "torch": 3, "defeated": 2, "lying_on_ground": 1, "face_mask": 1, "iguanid": 4, "medieval": 4, "comic_panel": 4, "threatening": 4, "darkness": 1, "dark_theme": 3, "note": 1, "domestic_goat": 4, "text_message": 3, "flask": 2, "rubble": 4, "black_speech_bubble": 4, "patchwork_creature": 2, "hunched_over": 4, "frilled_lizard": 2, "standing_over": 1, "3_panel_comic": 
2, "snot_bubble": 2, "evil_look": 4, "2_panel_comic": 3, "note_pad": 4, "medieval_fantasy": 4, "male_human": 2}, "extra_evidence": {"2_panel_comic": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6184}, "3_panel_comic": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6265}, "<3": {"source": "probe"}, "agamid": {"source": "implied"}, "anthro": {"source": "probe"}, "bear": {"source": "probe"}, "black_speech_bubble": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6325}, "bodily_fluids": {"source": "implied"}, "border": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5087}, "bubble": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7508}, "clothed": {"source": "structural"}, "clothing": {"source": "implied"}, "comic_panel": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6176}, "dark_theme": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5945}, "darkness": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8329}, "defeated": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6174}, "domestic_goat": {"source": "stage3", "why": "unknown", "retrieval_score": 0.604}, "duo": {"source": "structural"}, "evil_look": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5665}, "face_mask": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5493}, "felid": {"source": "probe"}, "flask": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5338}, "frilled_lizard": {"source": "stage3", "why": "unknown", "retrieval_score": 0.675}, "gecko": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6408}, "goo_creature": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5154}, "group": {"source": "structural"}, "human_only": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5271}, "hunched_over": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5729}, "iguanid": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6016}, 
"light": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7793}, "lying_on_ground": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7947}, "male_human": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5565}, "medieval": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5307}, "medieval_fantasy": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5207}, "noseless": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4577}, "not_furry": {"source": "implied"}, "note": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7399}, "note_pad": {"source": "stage3", "why": "unknown", "retrieval_score": 0.558}, "on_ground": {"source": "stage3", "why": "unknown", "retrieval_score": 0.674}, "patchwork_creature": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6123}, "rubble": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6096}, "snot": {"source": "implied"}, "snot_bubble": {"source": "stage3", "why": "unknown", "retrieval_score": 0.612}, "solo": {"source": "structural"}, "speech_bubble": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7584}, "standing_over": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7647}, "striped_body": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5492}, "stripes": {"source": "implied"}, "text_message": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5644}, "thought_bubble": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6581}, "threatening": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5625}, "topwear": {"source": "implied"}, "torch": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5677}, "unknown_species": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7697}, "vest": {"source": "stage3", "why": "unknown", "retrieval_score": 0.502}}, "structural": ["solo", "duo", "group", "clothed", "text"], "probe": ["group", "felid", "bear", "anthro", "<3"], "t1": 3.39, 
"t2": 1.61, "t3": 11.09, "t3s": 1.45, "t3p": 2.12, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=74 entity=1 copyright_filtered=2 generic_char_to_general=0 unknown_type=1"]}
7
+ {"id": 335343, "n_gt": 15, "n_retrieved": 94, "n_selected": 67, "n_implied": 11, "n_structural": 3, "n_probe": 6, "ret_R": 0.7333, "P": 0.194, "R": 0.8667, "F1": 0.3171, "leaf_P": 0.1509, "leaf_R": 0.6667, "leaf_F1": 0.2462, "n_leaf_sel": 53, "n_leaf_gt": 12, "ret_P": 0.117, "sel_given_ret": 1.1818, "over_sel": 4.47, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 53, "attempts_by_n_local": {"60": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}, "37": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 1, "char_F1": 0.0, "gen_P": 0.197, "gen_R": 0.8667, "gen_F1": 0.321, "missed": ["angry", "eyes_closed"], "extra": ["2_frame_animation", "<3", "accessory", "animated", "animated_png", "anime_eyes", "annoyed", "annoyed_expression", "anthro", "applying_makeup", "background_character", "bedroom", "big_eyes", "blush", "character_request", "clothing", "color_swatch", "contest", "curtains_open", "distracting_watermark", "english_text", "eyes", "font", "hair_accessory", "hair_sticks", "highlights_(coloring)", "humanoid", "lipstick", "long_hair", "looking_away", "lying_on_bed", "membrane_(anatomy)", "model_sheet", "name_tag", "on_bed", "pajamas", "palette", "path_lines", "personal_grooming", "playful", "purple_eyes", "purple_hands", "purple_highlights", "purple_membrane", "resting", "restricted_palette", "romantic", "romantic_ambiance", "scene_kid", "scenery", "scenery_porn", "sleeping_together", "stated_age", "watermark"], "ground_truth_tags": ["angry", "bed", "blonde_hair", "blue_eyes", "duo", "eyes_closed", 
"eyeshadow", "furniture", "green_eyes", "hair", "lying", "makeup", "purple_hair", "sleeping", "text"], "selected_tags": ["2_frame_animation", "<3", "accessory", "animated", "animated_png", "anime_eyes", "annoyed", "annoyed_expression", "anthro", "applying_makeup", "background_character", "bed", "bedroom", "big_eyes", "blonde_hair", "blue_eyes", "blush", "character_request", "clothing", "color_swatch", "contest", "curtains_open", "distracting_watermark", "duo", "english_text", "eyes", "eyeshadow", "font", "furniture", "green_eyes", "hair", "hair_accessory", "hair_sticks", "highlights_(coloring)", "humanoid", "lipstick", "long_hair", "looking_away", "lying", "lying_on_bed", "makeup", "membrane_(anatomy)", "model_sheet", "name_tag", "on_bed", "pajamas", "palette", "path_lines", "personal_grooming", "playful", "purple_eyes", "purple_hair", "purple_hands", "purple_highlights", "purple_membrane", "resting", "restricted_palette", "romantic", "romantic_ambiance", "scene_kid", "scenery", "scenery_porn", "sleeping", "sleeping_together", "stated_age", "text", "watermark"], "stage3_selected": ["2_frame_animation", "animated", "animated_png", "anime_eyes", "annoyed", "annoyed_expression", "applying_makeup", "background_character", "bedroom", "big_eyes", "blonde_hair", "blue_eyes", "blurred_background", "character_request", "color_swatch", "contest", "curtains_open", "distracting_watermark", "duo", "english_text", "eyes", "eyeshadow", "font", "green_eyes", "hair", "hair_sticks", "lipstick", "long_hair", "looking_away", "lying", "lying_on_bed", "makeup", "model_sheet", "name_tag", "pajamas", "palette", "path_lines", "playful", "purple_eyes", "purple_hair", "purple_hands", "purple_highlights", "purple_membrane", "resting", "restricted_palette", "romantic_ambiance", "scene_kid", "scenery", "scenery_porn", "sleeping", "sleeping_together", "stated_age", "text"], "stage3_selected_scores": {"hair": 0.6035, "duo": 0.4376, "text": 0.6011, "blue_eyes": 0.6018, "lying": 0.4498, 
"green_eyes": 0.5992, "long_hair": 0.459, "blonde_hair": 0.599, "purple_eyes": 0.4336, "purple_hair": 0.5644, "makeup": 0.5968, "eyeshadow": 0.4766, "lipstick": 0.4877, "bedroom": 0.4904, "sleeping": 0.6031, "model_sheet": 0.4234, "looking_away": 0.4294, "big_eyes": 0.4292, "annoyed": 0.5731, "blurred_background": 0.4119, "romantic_ambiance": 0.4804, "restricted_palette": 0.4771, "lying_on_bed": 0.4097, "color_swatch": 0.4617, "distracting_watermark": 0.5001, "pajamas": 0.3756, "playful": 0.4466, "scenery": 0.4938, "purple_highlights": 0.4302, "name_tag": 0.3238, "character_request": 0.3755, "path_lines": 0.4129, "resting": 0.5146, "background_character": 0.3893, "annoyed_expression": 0.7254, "curtains_open": 0.4194, "sleeping_together": 0.5087, "scene_kid": 0.4097, "anime_eyes": 0.4409, "hair_sticks": 0.5301, "stated_age": 0.4306, "palette": 0.6685, "contest": 0.3494, "purple_hands": 0.5399, "scenery_porn": 0.4297, "purple_membrane": 0.5791, "applying_makeup": 0.4732, "font": 0.5303, "eyes": 0.8951, "english_text": 0.4193, "animated_png": 0.4721, "animated": 0.3974, "2_frame_animation": 0.4462}, "stage3_selected_ranks": {"hair": 5, "duo": 56, "text": 8, "blue_eyes": 7, "lying": 47, "green_eyes": 9, "long_hair": 43, "blonde_hair": 10, "purple_eyes": 57, "purple_hair": 14, "makeup": 11, "eyeshadow": 37, "lipstick": 34, "bedroom": 33, "sleeping": 6, "model_sheet": 67, "looking_away": 62, "big_eyes": 63, "annoyed": 13, "blurred_background": 73, "romantic_ambiance": 35, "restricted_palette": 36, "lying_on_bed": 74, "color_swatch": 42, "distracting_watermark": 29, "pajamas": 88, "playful": 49, "scenery": 31, "purple_highlights": 60, "name_tag": 96, "character_request": 89, "path_lines": 72, "resting": 24, "background_character": 83, "annoyed_expression": 2, "curtains_open": 69, "sleeping_together": 25, "scene_kid": 75, "anime_eyes": 53, "hair_sticks": 22, "stated_age": 58, "palette": 3, "contest": 93, "purple_hands": 19, "scenery_porn": 61, "purple_membrane": 12, 
"applying_makeup": 39, "font": 21, "eyes": 1, "english_text": 70, "animated_png": 40, "animated": 79, "2_frame_animation": 50}, "stage3_selected_phrase_ranks": {"hair": 1, "duo": 2, "text": 1, "blue_eyes": 1, "lying": 1, "green_eyes": 1, "long_hair": 3, "blonde_hair": 1, "purple_eyes": 2, "purple_hair": 1, "makeup": 1, "eyeshadow": 3, "lipstick": 2, "bedroom": 1, "sleeping": 1, "model_sheet": 1, "looking_away": 3, "big_eyes": 3, "annoyed": 2, "blurred_background": 4, "romantic_ambiance": 1, "restricted_palette": 2, "lying_on_bed": 4, "color_swatch": 3, "distracting_watermark": 1, "pajamas": 3, "playful": 1, "scenery": 2, "purple_highlights": 3, "name_tag": 3, "character_request": 3, "path_lines": 2, "resting": 1, "background_character": 2, "annoyed_expression": 1, "curtains_open": 2, "sleeping_together": 2, "scene_kid": 4, "anime_eyes": 2, "hair_sticks": 2, "stated_age": 3, "palette": 1, "contest": 2, "purple_hands": 2, "scenery_porn": 3, "purple_membrane": 1, "applying_makeup": 4, "font": 1, "eyes": 1, "english_text": 4, "animated_png": 1, "animated": 4, "2_frame_animation": 3}, "extra_evidence": {"2_frame_animation": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4462}, "<3": {"source": "probe"}, "accessory": {"source": "implied"}, "animated": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3974}, "animated_png": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4721}, "anime_eyes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4409}, "annoyed": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5731}, "annoyed_expression": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7254}, "anthro": {"source": "probe"}, "applying_makeup": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4732}, "background_character": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3893}, "bedroom": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4904}, "big_eyes": {"source": "stage3", 
"why": "unknown", "retrieval_score": 0.4292}, "blush": {"source": "probe"}, "character_request": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3755}, "clothing": {"source": "implied"}, "color_swatch": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4617}, "contest": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3494}, "curtains_open": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4194}, "distracting_watermark": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5001}, "english_text": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4193}, "eyes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8951}, "font": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5303}, "hair_accessory": {"source": "implied"}, "hair_sticks": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5301}, "highlights_(coloring)": {"source": "implied"}, "humanoid": {"source": "structural"}, "lipstick": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4877}, "long_hair": {"source": "stage3", "why": "unknown", "retrieval_score": 0.459}, "looking_away": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4294}, "lying_on_bed": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4097}, "membrane_(anatomy)": {"source": "implied"}, "model_sheet": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4234}, "name_tag": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3238}, "on_bed": {"source": "implied"}, "pajamas": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3756}, "palette": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6685}, "path_lines": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4129}, "personal_grooming": {"source": "implied"}, "playful": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4466}, "purple_eyes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4336}, "purple_hands": 
{"source": "stage3", "why": "unknown", "retrieval_score": 0.5399}, "purple_highlights": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4302}, "purple_membrane": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5791}, "resting": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5146}, "restricted_palette": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4771}, "romantic": {"source": "implied"}, "romantic_ambiance": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4804}, "scene_kid": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4097}, "scenery": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4938}, "scenery_porn": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4297}, "sleeping_together": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5087}, "stated_age": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4306}, "watermark": {"source": "implied"}}, "structural": ["duo", "humanoid", "text"], "probe": ["text", "simple_background", "duo", "blush", "anthro", "<3"], "t1": 2.32, "t2": 1.99, "t3": 12.04, "t3s": 2.47, "t3p": 2.14, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=97 entity=0 copyright_filtered=0 generic_char_to_general=1 unknown_type=8"]}
8
+ {"id": 2034167, "n_gt": 11, "n_retrieved": 87, "n_selected": 55, "n_implied": 14, "n_structural": 4, "n_probe": 5, "ret_R": 0.5455, "P": 0.1818, "R": 0.9091, "F1": 0.303, "leaf_P": 0.1389, "leaf_R": 0.7143, "leaf_F1": 0.2326, "n_leaf_sel": 36, "n_leaf_gt": 7, "ret_P": 0.069, "sel_given_ret": 1.6667, "over_sel": 5.0, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 37, "attempts_by_n_local": {"60": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}, "32": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.1818, "gen_R": 0.9091, "gen_F1": 0.303, "missed": ["open_mouth"], "extra": ["actual_fur", "animal_humanoid", "anthro", "belly", "big_nose", "blue_inner_ear_fluff", "blue_stripes", "blue_tail", "bored_expression", "canid_humanoid", "canine_humanoid", "clothed", "clothing", "expressions", "facial_stripes", "fluffy_fur", "fox_humanoid", "glistening", "glistening_body", "glistening_fur", "glistening_tail", "gradient_tail", "half_body", "humanoid", "inner_ear_fluff", "jumper", "jumping", "light_tail", "lotus_pose", "male", "mammal_humanoid", "midriff", "pink_ears", "pink_stripes", "pink_tongue", "purple_belly", "slim_humanoid", "stripes", "tail", "tail_tuft", "tongue", "tuft", "white_inner_ear_fluff", "white_nose", "wolf_humanoid"], "ground_truth_tags": ["blue_eyes", "blue_nose", "canid", "canine", "fur", "mammal", "open_mouth", "purple_body", "solo", "white_body", "white_fur"], "selected_tags": ["actual_fur", "animal_humanoid", "anthro", "belly", "big_nose", "blue_eyes", 
"blue_inner_ear_fluff", "blue_nose", "blue_stripes", "blue_tail", "bored_expression", "canid", "canid_humanoid", "canine", "canine_humanoid", "clothed", "clothing", "expressions", "facial_stripes", "fluffy_fur", "fox_humanoid", "fur", "glistening", "glistening_body", "glistening_fur", "glistening_tail", "gradient_tail", "half_body", "humanoid", "inner_ear_fluff", "jumper", "jumping", "light_tail", "lotus_pose", "male", "mammal", "mammal_humanoid", "midriff", "pink_ears", "pink_stripes", "pink_tongue", "purple_belly", "purple_body", "slim_humanoid", "solo", "stripes", "tail", "tail_tuft", "tongue", "tuft", "white_body", "white_fur", "white_inner_ear_fluff", "white_nose", "wolf_humanoid"], "stage3_selected": ["actual_fur", "animal_humanoid", "big_nose", "blue_eyes", "blue_inner_ear_fluff", "blue_nose", "blue_stripes", "blue_tail", "blurred_background", "bored_expression", "canine_humanoid", "expressions", "facial_stripes", "fluffy_fur", "fox_humanoid", "glistening_fur", "glistening_tail", "gradient_tail", "half_body", "jumper", "jumping", "light_tail", "lotus_pose", "midriff", "pink_ears", "pink_stripes", "pink_tongue", "purple_belly", "purple_body", "simple_background", "slim_humanoid", "stripes", "tail_tuft", "white_fur", "white_inner_ear_fluff", "white_nose", "wolf_humanoid"], "stage3_selected_scores": {"simple_background": 0.5948, "blue_eyes": 0.5995, "white_fur": 0.5995, "animal_humanoid": 0.6159, "stripes": 0.6068, "purple_body": 0.564, "midriff": 0.3707, "tail_tuft": 0.4994, "pink_tongue": 0.4215, "canine_humanoid": 0.9003, "blue_nose": 0.6032, "fox_humanoid": 0.8204, "blue_tail": 0.5411, "blurred_background": 0.4989, "white_inner_ear_fluff": 0.597, "wolf_humanoid": 0.819, "jumping": 0.6014, "pink_ears": 0.5255, "glistening_fur": 0.5349, "blue_stripes": 0.6748, "big_nose": 0.476, "white_nose": 0.5269, "glistening_tail": 0.5986, "expressions": 0.4957, "light_tail": 0.5671, "pink_stripes": 0.682, "blue_inner_ear_fluff": 0.4727, "fluffy_fur": 0.5593, 
"purple_belly": 0.5454, "gradient_tail": 0.5876, "bored_expression": 0.4512, "slim_humanoid": 0.588, "facial_stripes": 0.5968, "half_body": 0.4115, "lotus_pose": 0.4767, "jumper": 0.4077, "actual_fur": 0.4563}, "stage3_selected_ranks": {"simple_background": 28, "blue_eyes": 23, "white_fur": 24, "animal_humanoid": 14, "stripes": 18, "purple_body": 37, "midriff": 89, "tail_tuft": 59, "pink_tongue": 81, "canine_humanoid": 1, "blue_nose": 20, "fox_humanoid": 3, "blue_tail": 44, "blurred_background": 60, "white_inner_ear_fluff": 26, "wolf_humanoid": 4, "jumping": 21, "pink_ears": 51, "glistening_fur": 46, "blue_stripes": 8, "big_nose": 72, "white_nose": 50, "glistening_tail": 25, "expressions": 61, "light_tail": 36, "pink_stripes": 7, "blue_inner_ear_fluff": 74, "fluffy_fur": 38, "purple_belly": 42, "gradient_tail": 31, "bored_expression": 78, "slim_humanoid": 30, "facial_stripes": 27, "half_body": 84, "lotus_pose": 71, "jumper": 85, "actual_fur": 76}, "stage3_selected_phrase_ranks": {"simple_background": 1, "blue_eyes": 1, "white_fur": 1, "animal_humanoid": 2, "stripes": 1, "purple_body": 2, "midriff": 3, "tail_tuft": 4, "pink_tongue": 3, "canine_humanoid": 1, "blue_nose": 1, "fox_humanoid": 3, "blue_tail": 2, "blurred_background": 4, "white_inner_ear_fluff": 2, "wolf_humanoid": 4, "jumping": 1, "pink_ears": 4, "glistening_fur": 4, "blue_stripes": 1, "big_nose": 3, "white_nose": 2, "glistening_tail": 2, "expressions": 3, "light_tail": 4, "pink_stripes": 1, "blue_inner_ear_fluff": 4, "fluffy_fur": 2, "purple_belly": 4, "gradient_tail": 3, "bored_expression": 4, "slim_humanoid": 4, "facial_stripes": 3, "half_body": 4, "lotus_pose": 3, "jumper": 3, "actual_fur": 3}, "extra_evidence": {"actual_fur": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4563}, "animal_humanoid": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6159}, "anthro": {"source": "structural"}, "belly": {"source": "implied"}, "big_nose": {"source": "stage3", "why": "unknown", 
"retrieval_score": 0.476}, "blue_inner_ear_fluff": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4727}, "blue_stripes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6748}, "blue_tail": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5411}, "bored_expression": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4512}, "canid_humanoid": {"source": "implied"}, "canine_humanoid": {"source": "stage3", "why": "unknown", "retrieval_score": 0.9003}, "clothed": {"source": "structural"}, "clothing": {"source": "probe"}, "expressions": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4957}, "facial_stripes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5968}, "fluffy_fur": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5593}, "fox_humanoid": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8204}, "glistening": {"source": "implied"}, "glistening_body": {"source": "implied"}, "glistening_fur": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5349}, "glistening_tail": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5986}, "gradient_tail": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5876}, "half_body": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4115}, "humanoid": {"source": "implied"}, "inner_ear_fluff": {"source": "implied"}, "jumper": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4077}, "jumping": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6014}, "light_tail": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5671}, "lotus_pose": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4767}, "male": {"source": "structural"}, "mammal_humanoid": {"source": "implied"}, "midriff": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3707}, "pink_ears": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5255}, "pink_stripes": {"source": "stage3", "why": "unknown", 
"retrieval_score": 0.682}, "pink_tongue": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4215}, "purple_belly": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5454}, "slim_humanoid": {"source": "stage3", "why": "unknown", "retrieval_score": 0.588}, "stripes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6068}, "tail": {"source": "implied"}, "tail_tuft": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4994}, "tongue": {"source": "implied"}, "tuft": {"source": "implied"}, "white_inner_ear_fluff": {"source": "stage3", "why": "unknown", "retrieval_score": 0.597}, "white_nose": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5269}, "wolf_humanoid": {"source": "stage3", "why": "unknown", "retrieval_score": 0.819}}, "structural": ["solo", "anthro", "male", "clothed"], "probe": ["solo", "simple_background", "clothing", "canid", "anthro"], "t1": 2.03, "t2": 1.93, "t3": 30.68, "t3s": 1.63, "t3p": 2.59, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=92 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=4"]}
9
+ {"id": 1325009, "n_gt": 22, "n_retrieved": 74, "n_selected": 46, "n_implied": 18, "n_structural": 5, "n_probe": 5, "ret_R": 0.3636, "P": 0.3696, "R": 0.7727, "F1": 0.5, "leaf_P": 0.125, "leaf_R": 0.25, "leaf_F1": 0.1667, "n_leaf_sel": 24, "n_leaf_gt": 12, "ret_P": 0.1081, "sel_given_ret": 2.125, "over_sel": 2.09, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 21, "attempts_by_n_local": {"60": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}, "14": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3696, "gen_R": 0.7727, "gen_F1": 0.5, "missed": ["chest_tuft", "muscular", "muscular_anthro", "muscular_male", "topless"], "extra": ["bear", "belly", "blue_inner_ear_fluff", "countershade_belly", "countershade_body", "cross-hatching", "glistening", "glistening_body", "glistening_eyes", "glistening_fur", "hand_on_own_head", "hatching_(art)", "inner_ear_fluff", "looking_at_viewer", "muscular_legs", "pattern_background", "pinup", "pose", "quads", "shaded", "siberian_tiger", "striped_body", "striped_fur", "tan_body", "tan_bottomwear", "tan_clothing", "tan_countershading", "tan_shorts", "white_chest"], "ground_truth_tags": ["anthro", "blue_eyes", "bottomwear", "chest_tuft", "clothed", "clothing", "countershading", "felid", "fur", "hand_on_head", "male", "mammal", "muscular", "muscular_anthro", "muscular_male", "pantherine", "shorts", "solo", "stripes", "tiger", "topless", "tuft"], "selected_tags": ["anthro", "bear", "belly", "blue_eyes", "blue_inner_ear_fluff", "bottomwear", "clothed", 
"clothing", "countershade_belly", "countershade_body", "countershading", "cross-hatching", "felid", "fur", "glistening", "glistening_body", "glistening_eyes", "glistening_fur", "hand_on_head", "hand_on_own_head", "hatching_(art)", "inner_ear_fluff", "looking_at_viewer", "male", "mammal", "muscular_legs", "pantherine", "pattern_background", "pinup", "pose", "quads", "shaded", "shorts", "siberian_tiger", "solo", "striped_body", "striped_fur", "stripes", "tan_body", "tan_bottomwear", "tan_clothing", "tan_countershading", "tan_shorts", "tiger", "tuft", "white_chest"], "stage3_selected": ["blue_eyes", "blue_inner_ear_fluff", "countershade_belly", "countershade_body", "cross-hatching", "glistening_eyes", "glistening_fur", "hand_on_head", "hand_on_own_head", "muscular_legs", "pattern_background", "pinup", "quads", "shorts", "siberian_tiger", "striped_body", "striped_fur", "tan_countershading", "tan_shorts", "tuft", "white_chest"], "stage3_selected_scores": {"blue_eyes": 0.5785, "tuft": 0.497, "shorts": 0.5914, "striped_body": 0.4439, "pinup": 0.5187, "striped_fur": 0.6475, "hand_on_head": 0.6014, "glistening_eyes": 0.4769, "pattern_background": 0.5269, "quads": 0.6744, "tan_countershading": 0.7245, "glistening_fur": 0.501, "muscular_legs": 0.791, "white_chest": 0.917, "countershade_body": 0.8721, "blue_inner_ear_fluff": 0.428, "countershade_belly": 0.828, "cross-hatching": 0.4762, "siberian_tiger": 0.4939, "tan_shorts": 0.5498, "hand_on_own_head": 0.529}, "stage3_selected_ranks": {"blue_eyes": 36, "tuft": 53, "shorts": 34, "striped_body": 70, "pinup": 46, "striped_fur": 21, "hand_on_head": 29, "glistening_eyes": 61, "pattern_background": 44, "quads": 18, "tan_countershading": 13, "glistening_fur": 51, "muscular_legs": 9, "white_chest": 2, "countershade_body": 3, "blue_inner_ear_fluff": 72, "countershade_belly": 7, "cross-hatching": 62, "siberian_tiger": 56, "tan_shorts": 38, "hand_on_own_head": 43}, "stage3_selected_phrase_ranks": {"blue_eyes": 2, "tuft": 4, "shorts": 1, 
"striped_body": 2, "pinup": 3, "striped_fur": 2, "hand_on_head": 2, "glistening_eyes": 3, "pattern_background": 1, "quads": 4, "tan_countershading": 4, "glistening_fur": 3, "muscular_legs": 2, "white_chest": 1, "countershade_body": 1, "blue_inner_ear_fluff": 4, "countershade_belly": 2, "cross-hatching": 3, "siberian_tiger": 2, "tan_shorts": 2, "hand_on_own_head": 4}, "extra_evidence": {"bear": {"source": "probe"}, "belly": {"source": "implied"}, "blue_inner_ear_fluff": {"source": "stage3", "why": "unknown", "retrieval_score": 0.428}, "countershade_belly": {"source": "stage3", "why": "unknown", "retrieval_score": 0.828}, "countershade_body": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8721}, "cross-hatching": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4762}, "glistening": {"source": "implied"}, "glistening_body": {"source": "implied"}, "glistening_eyes": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4769}, "glistening_fur": {"source": "stage3", "why": "unknown", "retrieval_score": 0.501}, "hand_on_own_head": {"source": "stage3", "why": "unknown", "retrieval_score": 0.529}, "hatching_(art)": {"source": "implied"}, "inner_ear_fluff": {"source": "implied"}, "looking_at_viewer": {"source": "structural"}, "muscular_legs": {"source": "stage3", "why": "unknown", "retrieval_score": 0.791}, "pattern_background": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5269}, "pinup": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5187}, "pose": {"source": "implied"}, "quads": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6744}, "shaded": {"source": "implied"}, "siberian_tiger": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4939}, "striped_body": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4439}, "striped_fur": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6475}, "tan_body": {"source": "implied"}, "tan_bottomwear": {"source": "implied"}, "tan_clothing": 
{"source": "implied"}, "tan_countershading": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7245}, "tan_shorts": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5498}, "white_chest": {"source": "stage3", "why": "unknown", "retrieval_score": 0.917}}, "structural": ["solo", "anthro", "male", "clothed", "looking_at_viewer"], "probe": ["solo", "felid", "clothing", "bear", "anthro"], "t1": 1.84, "t2": 1.74, "t3": 4.2, "t3s": 1.32, "t3p": 1.64, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=74 entity=1 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
10
+ {"id": 3285630, "n_gt": 12, "n_retrieved": 66, "n_selected": 78, "n_implied": 27, "n_structural": 4, "n_probe": 5, "ret_R": 0.25, "P": 0.1282, "R": 0.8333, "F1": 0.2222, "leaf_P": 0.093, "leaf_R": 0.4444, "leaf_F1": 0.1538, "n_leaf_sel": 43, "n_leaf_gt": 9, "ret_P": 0.0455, "sel_given_ret": 3.3333, "over_sel": 6.5, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 46, "attempts_by_n_local": {"60": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}, "10": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.1282, "gen_R": 0.8333, "gen_F1": 0.2222, "missed": ["alpha_channel", "fingers"], "extra": ["beverage", "big_hands", "black_necktie", "black_nose", "blue_clothing", "blue_shirt", "blue_topwear", "bottom_heavy", "bottomwear", "brown_clothing", "brown_topwear", "brown_vest", "business_attire", "clasped_hands", "coffee_cup", "coffee_mug", "container", "cup", "domestic_cat", "dress_shirt", "felis", "formal", "frown", "gesture", "green_background", "grey_clothing", "grey_shirt", "grey_topwear", "hair_bun", "hands_together", "handshake", "holding_beverage", "holding_container", "holding_cup", "holding_mug", "holding_object", "jacket", "jacket_vest", "left-handed", "mug", "necktie", "pants", "running", "scowl", "shirt", "sketch", "sketch_background", "suit_jacket", "sweater", "sweater_vest", "t-shirt", "tan_body", "tan_bottomwear", "tan_clothing", "tan_fur", "tan_pants", "teal_shirt", "teal_topwear", "text", "thick_thighs", "topwear", "undershirt", "vest", "white_clothing", "white_necktie", 
"white_topwear", "wide_hips", "yellow_background"], "ground_truth_tags": ["alpha_channel", "anthro", "clothed", "clothing", "felid", "feline", "fingers", "fur", "hair", "male", "mammal", "solo"], "selected_tags": ["anthro", "beverage", "big_hands", "black_necktie", "black_nose", "blue_clothing", "blue_shirt", "blue_topwear", "bottom_heavy", "bottomwear", "brown_clothing", "brown_topwear", "brown_vest", "business_attire", "clasped_hands", "clothed", "clothing", "coffee_cup", "coffee_mug", "container", "cup", "domestic_cat", "dress_shirt", "felid", "feline", "felis", "formal", "frown", "fur", "gesture", "green_background", "grey_clothing", "grey_shirt", "grey_topwear", "hair", "hair_bun", "hands_together", "handshake", "holding_beverage", "holding_container", "holding_cup", "holding_mug", "holding_object", "jacket", "jacket_vest", "left-handed", "male", "mammal", "mug", "necktie", "pants", "running", "scowl", "shirt", "sketch", "sketch_background", "solo", "suit_jacket", "sweater", "sweater_vest", "t-shirt", "tan_body", "tan_bottomwear", "tan_clothing", "tan_fur", "tan_pants", "teal_shirt", "teal_topwear", "text", "thick_thighs", "topwear", "undershirt", "vest", "white_clothing", "white_necktie", "white_topwear", "wide_hips", "yellow_background"], "stage3_selected": ["big_hands", "black_necktie", "black_nose", "blue_shirt", "bottom_heavy", "brown_vest", "business_attire", "clasped_hands", "coffee_cup", "coffee_mug", "domestic_cat", "dress_shirt", "feline", "formal", "fur", "green_background", "grey_shirt", "hair_bun", "hands_together", "handshake", "holding_beverage", "holding_cup", "holding_mug", "invalid_background", "jacket_vest", "left-handed", "mug", "necktie", "running", "scowl", "shirt", "simple_background", "sketch_background", "suit_jacket", "sweater_vest", "t-shirt", "tan_fur", "tan_pants", "teal_shirt", "thick_thighs", "undershirt", "vest", "white_necktie", "white_topwear", "wide_hips", "yellow_background"], "stage3_selected_scores": {"fur": 0.7146, 
"simple_background": 0.6978, "feline": 0.7062, "thick_thighs": 0.4711, "shirt": 0.7998, "domestic_cat": 0.6329, "wide_hips": 0.4732, "black_nose": 0.6261, "tan_fur": 0.5779, "necktie": 0.7314, "t-shirt": 0.7846, "white_topwear": 0.7154, "vest": 0.8403, "green_background": 0.6069, "running": 0.5147, "yellow_background": 0.6334, "dress_shirt": 0.6132, "blue_shirt": 0.751, "holding_cup": 0.7667, "hair_bun": 0.6926, "big_hands": 0.4968, "holding_beverage": 0.7721, "bottom_heavy": 0.4663, "coffee_mug": 0.7055, "mug": 0.8841, "hands_together": 0.5547, "grey_shirt": 0.7582, "coffee_cup": 0.6863, "undershirt": 0.7599, "scowl": 0.5567, "sweater_vest": 0.7532, "holding_mug": 0.916, "clasped_hands": 0.6268, "suit_jacket": 0.5924, "black_necktie": 0.7132, "tan_pants": 0.7373, "handshake": 0.5511, "formal": 0.5993, "business_attire": 0.5657, "left-handed": 0.5479, "sketch_background": 0.5928, "jacket_vest": 0.772, "brown_vest": 0.8153, "teal_shirt": 0.7474, "white_necktie": 0.6418, "invalid_background": 0.6495}, "stage3_selected_ranks": {"fur": 23, "simple_background": 28, "feline": 25, "thick_thighs": 68, "shirt": 5, "domestic_cat": 37, "wide_hips": 67, "black_nose": 40, "tan_fur": 52, "necktie": 19, "t-shirt": 7, "white_topwear": 22, "vest": 3, "green_background": 45, "running": 64, "yellow_background": 36, "dress_shirt": 42, "blue_shirt": 16, "holding_cup": 10, "hair_bun": 29, "big_hands": 65, "holding_beverage": 8, "bottom_heavy": 69, "coffee_mug": 26, "mug": 2, "hands_together": 56, "grey_shirt": 12, "coffee_cup": 30, "undershirt": 11, "scowl": 55, "sweater_vest": 14, "holding_mug": 1, "clasped_hands": 39, "suit_jacket": 48, "black_necktie": 24, "tan_pants": 18, "handshake": 59, "formal": 46, "business_attire": 54, "left-handed": 60, "sketch_background": 47, "jacket_vest": 9, "brown_vest": 4, "teal_shirt": 17, "white_necktie": 33, "invalid_background": 31}, "stage3_selected_phrase_ranks": {"fur": 1, "simple_background": 1, "feline": 1, "thick_thighs": 3, "shirt": 1, 
"domestic_cat": 4, "wide_hips": 2, "black_nose": 2, "tan_fur": 4, "necktie": 1, "t-shirt": 2, "white_topwear": 1, "vest": 1, "green_background": 2, "running": 3, "yellow_background": 3, "dress_shirt": 2, "blue_shirt": 3, "holding_cup": 4, "hair_bun": 1, "big_hands": 4, "holding_beverage": 3, "bottom_heavy": 4, "coffee_mug": 3, "mug": 1, "hands_together": 2, "grey_shirt": 1, "coffee_cup": 4, "undershirt": 3, "scowl": 4, "sweater_vest": 3, "holding_mug": 1, "clasped_hands": 1, "suit_jacket": 4, "black_necktie": 2, "tan_pants": 3, "handshake": 3, "formal": 1, "business_attire": 1, "left-handed": 4, "sketch_background": 3, "jacket_vest": 2, "brown_vest": 1, "teal_shirt": 4, "white_necktie": 4, "invalid_background": 1}, "extra_evidence": {"beverage": {"source": "implied"}, "big_hands": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4968}, "black_necktie": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7132}, "black_nose": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6261}, "blue_clothing": {"source": "implied"}, "blue_shirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.751}, "blue_topwear": {"source": "implied"}, "bottom_heavy": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4663}, "bottomwear": {"source": "implied"}, "brown_clothing": {"source": "implied"}, "brown_topwear": {"source": "implied"}, "brown_vest": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8153}, "business_attire": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5657}, "clasped_hands": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6268}, "coffee_cup": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6863}, "coffee_mug": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7055}, "container": {"source": "implied"}, "cup": {"source": "implied"}, "domestic_cat": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6329}, "dress_shirt": {"source": "stage3", "why": "unknown", 
"retrieval_score": 0.6132}, "felis": {"source": "implied"}, "formal": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5993}, "frown": {"source": "implied"}, "gesture": {"source": "implied"}, "green_background": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6069}, "grey_clothing": {"source": "implied"}, "grey_shirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7582}, "grey_topwear": {"source": "implied"}, "hair_bun": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6926}, "hands_together": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5547}, "handshake": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5511}, "holding_beverage": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7721}, "holding_container": {"source": "implied"}, "holding_cup": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7667}, "holding_mug": {"source": "stage3", "why": "unknown", "retrieval_score": 0.916}, "holding_object": {"source": "implied"}, "jacket": {"source": "implied"}, "jacket_vest": {"source": "stage3", "why": "unknown", "retrieval_score": 0.772}, "left-handed": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5479}, "mug": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8841}, "necktie": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7314}, "pants": {"source": "implied"}, "running": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5147}, "scowl": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5567}, "shirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7998}, "sketch": {"source": "implied"}, "sketch_background": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5928}, "suit_jacket": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5924}, "sweater": {"source": "implied"}, "sweater_vest": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7532}, "t-shirt": {"source": "stage3", "why": 
"unknown", "retrieval_score": 0.7846}, "tan_body": {"source": "implied"}, "tan_bottomwear": {"source": "implied"}, "tan_clothing": {"source": "implied"}, "tan_fur": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5779}, "tan_pants": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7373}, "teal_shirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7474}, "teal_topwear": {"source": "implied"}, "text": {"source": "probe"}, "thick_thighs": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4711}, "topwear": {"source": "implied"}, "undershirt": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7599}, "vest": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8403}, "white_clothing": {"source": "implied"}, "white_necktie": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6418}, "white_topwear": {"source": "stage3", "why": "unknown", "retrieval_score": 0.7154}, "wide_hips": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4732}, "yellow_background": {"source": "stage3", "why": "unknown", "retrieval_score": 0.6334}}, "structural": ["solo", "anthro", "male", "clothed"], "probe": ["text", "solo", "felid", "clothing", "anthro"], "t1": 2.99, "t2": 1.43, "t3": 6.03, "t3s": 0.98, "t3p": 0.75, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=70 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
11
+ {"id": 260449, "n_gt": 14, "n_retrieved": 66, "n_selected": 49, "n_implied": 9, "n_structural": 6, "n_probe": 5, "ret_R": 0.5714, "P": 0.2245, "R": 0.7857, "F1": 0.3492, "leaf_P": 0.0625, "leaf_R": 0.2, "leaf_F1": 0.0952, "n_leaf_sel": 32, "n_leaf_gt": 10, "ret_P": 0.1212, "sel_given_ret": 1.375, "over_sel": 3.5, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 34, "attempts_by_n_local": {"60": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}, "9": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.2245, "gen_R": 0.7857, "gen_F1": 0.3492, "missed": ["fur", "human", "male"], "extra": ["anthro", "balancing", "blonde_hair", "bottomwear", "cheek_to_cheek", "chimpanzee", "crossed_arms", "dancer_outfit", "duo", "feral", "flash", "front_view", "gorilla", "grin", "grinning_at_viewer", "interactive", "laugh", "loincloth", "no_sound", "one_eye_closed", "pan_(genus)", "raised_arms", "raised_leg", "red_hair", "shaggy_hair", "smile", "smiling_at_viewer", "smirk", "smirking_at_viewer", "smug_grin", "staff", "toony_expression", "topless", "trio", "wide_grin", "wink", "winking_at_viewer", "yelling"], "ground_truth_tags": ["ape", "bear", "clothed", "clothing", "dancing", "fur", "group", "hair", "haplorhine", "human", "looking_at_viewer", "male", "mammal", "primate"], "selected_tags": ["anthro", "ape", "balancing", "bear", "blonde_hair", "bottomwear", "cheek_to_cheek", "chimpanzee", "clothed", "clothing", "crossed_arms", "dancer_outfit", "dancing", "duo", "feral", "flash", "front_view", "gorilla", 
"grin", "grinning_at_viewer", "group", "hair", "haplorhine", "interactive", "laugh", "loincloth", "looking_at_viewer", "mammal", "no_sound", "one_eye_closed", "pan_(genus)", "primate", "raised_arms", "raised_leg", "red_hair", "shaggy_hair", "smile", "smiling_at_viewer", "smirk", "smirking_at_viewer", "smug_grin", "staff", "toony_expression", "topless", "trio", "wide_grin", "wink", "winking_at_viewer", "yelling"], "stage3_selected": ["ape", "balancing", "bear", "blonde_hair", "cheek_to_cheek", "chimpanzee", "crossed_arms", "dancer_outfit", "dancing", "flash", "front_view", "gorilla", "grin", "grinning_at_viewer", "hair", "interactive", "laugh", "loincloth", "looking_at_viewer", "no_sound", "primate", "raised_arms", "raised_leg", "red_hair", "shaggy_hair", "simple_background", "smirk", "smirking_at_viewer", "smug_grin", "staff", "toony_expression", "wide_grin", "winking_at_viewer", "yelling"], "stage3_selected_scores": {"hair": 0.5445, "simple_background": 0.5491, "looking_at_viewer": 0.5475, "blonde_hair": 0.3637, "red_hair": 0.3652, "bear": 0.5735, "front_view": 0.4609, "grin": 0.5653, "raised_leg": 0.4324, "smirk": 0.3593, "primate": 0.8905, "loincloth": 0.5685, "crossed_arms": 0.421, "dancing": 0.5568, "laugh": 0.5259, "staff": 0.3682, "ape": 0.9767, "raised_arms": 0.5445, "yelling": 0.3709, "gorilla": 0.8299, "winking_at_viewer": 0.404, "smug_grin": 0.3703, "flash": 0.3205, "chimpanzee": 0.8275, "smirking_at_viewer": 0.4352, "no_sound": 0.2973, "balancing": 0.4094, "interactive": 0.4085, "grinning_at_viewer": 0.442, "shaggy_hair": 0.3489, "dancer_outfit": 0.4163, "cheek_to_cheek": 0.3714, "toony_expression": 0.4685, "wide_grin": 0.5267}, "stage3_selected_ranks": {"hair": 14, "simple_background": 11, "looking_at_viewer": 12, "blonde_hair": 63, "red_hair": 61, "bear": 6, "front_view": 27, "grin": 8, "raised_leg": 36, "smirk": 64, "primate": 2, "loincloth": 7, "crossed_arms": 41, "dancing": 10, "laugh": 16, "staff": 60, "ape": 1, "raised_arms": 13, "yelling": 57, 
"gorilla": 4, "winking_at_viewer": 47, "smug_grin": 58, "flash": 69, "chimpanzee": 5, "smirking_at_viewer": 34, "no_sound": 71, "balancing": 45, "interactive": 46, "grinning_at_viewer": 32, "shaggy_hair": 66, "dancer_outfit": 43, "cheek_to_cheek": 56, "toony_expression": 25, "wide_grin": 15}, "stage3_selected_phrase_ranks": {"hair": 1, "simple_background": 1, "looking_at_viewer": 1, "blonde_hair": 4, "red_hair": 3, "bear": 1, "front_view": 2, "grin": 1, "raised_leg": 2, "smirk": 3, "primate": 1, "loincloth": 1, "crossed_arms": 4, "dancing": 1, "laugh": 1, "staff": 4, "ape": 1, "raised_arms": 1, "yelling": 3, "gorilla": 2, "winking_at_viewer": 4, "smug_grin": 4, "flash": 2, "chimpanzee": 3, "smirking_at_viewer": 4, "no_sound": 4, "balancing": 4, "interactive": 1, "grinning_at_viewer": 3, "shaggy_hair": 3, "dancer_outfit": 3, "cheek_to_cheek": 3, "toony_expression": 1, "wide_grin": 1}, "extra_evidence": {"anthro": {"source": "structural"}, "balancing": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4094}, "blonde_hair": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3637}, "bottomwear": {"source": "implied"}, "cheek_to_cheek": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3714}, "chimpanzee": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8275}, "crossed_arms": {"source": "stage3", "why": "unknown", "retrieval_score": 0.421}, "dancer_outfit": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4163}, "duo": {"source": "probe"}, "feral": {"source": "structural"}, "flash": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3205}, "front_view": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4609}, "gorilla": {"source": "stage3", "why": "unknown", "retrieval_score": 0.8299}, "grin": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5653}, "grinning_at_viewer": {"source": "stage3", "why": "unknown", "retrieval_score": 0.442}, "interactive": {"source": "stage3", "why": "unknown", 
"retrieval_score": 0.4085}, "laugh": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5259}, "loincloth": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5685}, "no_sound": {"source": "stage3", "why": "unknown", "retrieval_score": 0.2973}, "one_eye_closed": {"source": "implied"}, "pan_(genus)": {"source": "implied"}, "raised_arms": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5445}, "raised_leg": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4324}, "red_hair": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3652}, "shaggy_hair": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3489}, "smile": {"source": "implied"}, "smiling_at_viewer": {"source": "implied"}, "smirk": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3593}, "smirking_at_viewer": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4352}, "smug_grin": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3703}, "staff": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3682}, "toony_expression": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4685}, "topless": {"source": "structural"}, "trio": {"source": "structural"}, "wide_grin": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5267}, "wink": {"source": "implied"}, "winking_at_viewer": {"source": "stage3", "why": "unknown", "retrieval_score": 0.404}, "yelling": {"source": "stage3", "why": "unknown", "retrieval_score": 0.3709}}, "structural": ["trio", "anthro", "feral", "clothed", "topless", "looking_at_viewer"], "probe": ["simple_background", "group", "duo", "bear", "anthro"], "t1": 3.18, "t2": 1.78, "t3": 4.3, "t3s": 1.95, "t3p": 1.6, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=69 entity=1 copyright_filtered=1 generic_char_to_general=1 unknown_type=2"]}
data/eval_results/k_sweep_explicit_no_why_seed42_k6.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
data/eval_results/latency_baseline_seed42.jsonl ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"_meta": true, "timestamp": "2026-03-02T05:51:59.829566", "n_samples": 10, "caption_field": "caption_cogvlm", "skip_rewrite": false, "allow_nsfw": false, "mode": "chunked_map_union", "chunk_size": 60, "eval_path": "data/eval_samples/e621_sfw_sample_1000_seed123_buffer10000_caption_evident.jsonl", "per_phrase_k": 2, "per_phrase_final_k": 10, "temperature": 0.0, "shuffle": false, "seed": 42, "workers": 4, "min_why": "strong_implied", "expand_implications": true, "infer_structural": true, "infer_probe": true, "n_errors": 0, "n_issue_samples": 10, "n_issues_total": 27}
2
+ {"id": 3285630, "n_gt": 12, "n_retrieved": 150, "n_selected": 59, "n_implied": 26, "n_structural": 4, "n_probe": 5, "ret_R": 0.25, "P": 0.1525, "R": 0.75, "F1": 0.2535, "leaf_P": 0.0741, "leaf_R": 0.2222, "leaf_F1": 0.1111, "n_leaf_sel": 27, "n_leaf_gt": 9, "ret_P": 0.02, "sel_given_ret": 3.0, "over_sel": 4.92, "why": {"explicit": 24, "strong_implied": 3}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 3, "calls_with_selection": 3, "calls_exhausted_retries": 0, "attempts_total": 3, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 3, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 86, "attempts_by_n_local": {"60": {"attempts": 2, "parse_ok": 2, "parse_fail": 0, "errors": 0}, "33": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.1525, "gen_R": 0.75, "gen_F1": 0.2535, "missed": ["alpha_channel", "fingers", "male"], "extra": ["beverage", "black_body", "black_clothing", "black_fur", "black_shirt", "black_topwear", "black_vest", "business_attire", "business_suit", "businesswear", "coffee", "container", "cup", "domestic_cat", "felis", "formal", "green_clothing", "green_shirt", "green_t-shirt", "green_topwear", "green_vest", "grey_clothing", "grey_shirt", "grey_topwear", "hair_bun", "holding_container", "holding_cup", "holding_mug", "holding_object", "jacket", "mug", "necktie", "polo_shirt", "red_clothing", "red_topwear", "red_vest", "shirt", "sleeveless_shirt", "suit", "suit_jacket", "t-shirt", "text", "topless", "topwear", "vest", "waiter", "white_clothing", "white_necktie", "white_shirt", "white_topwear"], "ground_truth_tags": ["alpha_channel", "anthro", "clothed", "clothing", "felid", "feline", "fingers", "fur", "hair", "male", "mammal", "solo"], "selected_tags": ["anthro", 
"beverage", "black_body", "black_clothing", "black_fur", "black_shirt", "black_topwear", "black_vest", "business_attire", "business_suit", "businesswear", "clothed", "clothing", "coffee", "container", "cup", "domestic_cat", "felid", "feline", "felis", "formal", "fur", "green_clothing", "green_shirt", "green_t-shirt", "green_topwear", "green_vest", "grey_clothing", "grey_shirt", "grey_topwear", "hair", "hair_bun", "holding_container", "holding_cup", "holding_mug", "holding_object", "jacket", "mammal", "mug", "necktie", "polo_shirt", "red_clothing", "red_topwear", "red_vest", "shirt", "sleeveless_shirt", "solo", "suit", "suit_jacket", "t-shirt", "text", "topless", "topwear", "vest", "waiter", "white_clothing", "white_necktie", "white_shirt", "white_topwear"], "stage3_selected": ["black_fur", "black_shirt", "black_vest", "business_attire", "business_suit", "businesswear", "coffee", "domestic_cat", "formal", "green_t-shirt", "green_vest", "grey_shirt", "hair_bun", "holding_cup", "holding_mug", "jacket", "polo_shirt", "red_vest", "shirt", "simple_background", "sleeveless_shirt", "suit_jacket", "topwear", "vest", "waiter", "white_necktie", "white_shirt"], "stage3_selected_scores": {"simple_background": 0.7012, "topwear": 0.7053, "shirt": 0.8041, "domestic_cat": 0.6355, "black_fur": 0.722, "jacket": 0.6702, "vest": 0.8437, "white_shirt": 0.7408, "black_shirt": 0.7383, "coffee": 0.6427, "holding_cup": 0.7694, "hair_bun": 0.6946, "waiter": 0.5913, "sleeveless_shirt": 0.7091, "grey_shirt": 0.7606, "business_suit": 0.5775, "polo_shirt": 0.7142, "holding_mug": 0.9184, "black_vest": 0.7142, "businesswear": 0.5741, "green_t-shirt": 0.7407, "green_vest": 0.7238, "red_vest": 0.6652, "white_necktie": 0.644, "suit_jacket": 0.6893, "formal": 0.601, "business_attire": 0.5683}, "stage3_selected_ranks": {"simple_background": 49, "topwear": 47, "shirt": 5, "domestic_cat": 72, "black_fur": 35, "jacket": 58, "vest": 3, "white_shirt": 22, "black_shirt": 25, "coffee": 67, "holding_cup": 10, 
"hair_bun": 52, "waiter": 89, "sleeveless_shirt": 45, "grey_shirt": 14, "business_suit": 96, "polo_shirt": 41, "holding_mug": 1, "black_vest": 42, "businesswear": 100, "green_t-shirt": 23, "green_vest": 34, "red_vest": 59, "white_necktie": 66, "suit_jacket": 54, "formal": 86, "business_attire": 102}, "stage3_selected_phrase_ranks": {"simple_background": 1, "topwear": 9, "shirt": 1, "domestic_cat": 4, "black_fur": 1, "jacket": 8, "vest": 1, "white_shirt": 4, "black_shirt": 8, "coffee": 8, "holding_cup": 4, "hair_bun": 1, "waiter": 5, "sleeveless_shirt": 7, "grey_shirt": 1, "business_suit": 8, "polo_shirt": 6, "holding_mug": 1, "black_vest": 9, "businesswear": 9, "green_t-shirt": 7, "green_vest": 6, "red_vest": 9, "white_necktie": 10, "suit_jacket": 4, "formal": 1, "business_attire": 1}, "extra_evidence": {"beverage": {"source": "implied"}, "black_body": {"source": "implied"}, "black_clothing": {"source": "implied"}, "black_fur": {"source": "stage3", "why": "explicit", "retrieval_score": 0.722}, "black_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7383}, "black_topwear": {"source": "implied"}, "black_vest": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7142}, "business_attire": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.5683}, "business_suit": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5775}, "businesswear": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5741}, "coffee": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6427}, "container": {"source": "implied"}, "cup": {"source": "implied"}, "domestic_cat": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6355}, "felis": {"source": "implied"}, "formal": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.601}, "green_clothing": {"source": "implied"}, "green_shirt": {"source": "implied"}, "green_t-shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7407}, "green_topwear": 
{"source": "implied"}, "green_vest": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7238}, "grey_clothing": {"source": "implied"}, "grey_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7606}, "grey_topwear": {"source": "implied"}, "hair_bun": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6946}, "holding_container": {"source": "implied"}, "holding_cup": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7694}, "holding_mug": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9184}, "holding_object": {"source": "implied"}, "jacket": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6702}, "mug": {"source": "implied"}, "necktie": {"source": "implied"}, "polo_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7142}, "red_clothing": {"source": "implied"}, "red_topwear": {"source": "implied"}, "red_vest": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6652}, "shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8041}, "sleeveless_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7091}, "suit": {"source": "implied"}, "suit_jacket": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.6893}, "t-shirt": {"source": "implied"}, "text": {"source": "probe"}, "topless": {"source": "structural"}, "topwear": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7053}, "vest": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8437}, "waiter": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5913}, "white_clothing": {"source": "implied"}, "white_necktie": {"source": "stage3", "why": "explicit", "retrieval_score": 0.644}, "white_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7408}, "white_topwear": {"source": "implied"}}, "structural": ["solo", "anthro", "clothed", "topless"], "probe": ["clothing", "anthro", "text", "felid", "solo"], "t1": 2.31, "t2": 1.8, "t3": 47.39, 
"t3s": 6.34, "t3p": 8.32, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=153 entity=1 copyright_filtered=1 generic_char_to_general=0 unknown_type=2"]}
3
+ {"id": 260449, "n_gt": 14, "n_retrieved": 160, "n_selected": 42, "n_implied": 14, "n_structural": 6, "n_probe": 6, "ret_R": 0.5714, "P": 0.2619, "R": 0.7857, "F1": 0.3929, "leaf_P": 0.0435, "leaf_R": 0.1, "leaf_F1": 0.0606, "n_leaf_sel": 23, "n_leaf_gt": 10, "ret_P": 0.05, "sel_given_ret": 1.375, "over_sel": 3.0, "why": {"explicit": 15, "strong_implied": 5}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 3, "calls_with_selection": 3, "calls_exhausted_retries": 0, "attempts_total": 3, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 3, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 82, "attempts_by_n_local": {"60": {"attempts": 2, "parse_ok": 2, "parse_fail": 0, "errors": 0}, "41": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.2619, "gen_R": 0.7857, "gen_F1": 0.3929, "missed": ["fur", "human", "male"], "extra": ["<3", "american_black_bear", "anthro", "arms_above_head", "auburn_hair", "black_bear", "bottomwear", "chimpanzee", "duo", "feral", "gesture", "gorilla", "grinning_at_viewer", "kermode_bear", "loincloth", "monkey", "one_eye_closed", "pan_(genus)", "pointing", "pointing_at_viewer", "raised_arms", "sloth_bear", "smug_eyes", "spread_arms", "sun_bear", "tap_dancing", "topless", "trio", "ursine", "wink", "winking_at_viewer"], "ground_truth_tags": ["ape", "bear", "clothed", "clothing", "dancing", "fur", "group", "hair", "haplorhine", "human", "looking_at_viewer", "male", "mammal", "primate"], "selected_tags": ["<3", "american_black_bear", "anthro", "ape", "arms_above_head", "auburn_hair", "bear", "black_bear", "bottomwear", "chimpanzee", "clothed", "clothing", "dancing", "duo", "feral", "gesture", "gorilla", "grinning_at_viewer", "group", "hair", "haplorhine", 
"kermode_bear", "loincloth", "looking_at_viewer", "mammal", "monkey", "one_eye_closed", "pan_(genus)", "pointing", "pointing_at_viewer", "primate", "raised_arms", "sloth_bear", "smug_eyes", "spread_arms", "sun_bear", "tap_dancing", "topless", "trio", "ursine", "wink", "winking_at_viewer"], "stage3_selected": ["arms_above_head", "auburn_hair", "bear", "chimpanzee", "dancing", "gorilla", "grinning_at_viewer", "kermode_bear", "loincloth", "looking_at_viewer", "monkey", "pointing_at_viewer", "primate", "raised_arms", "sloth_bear", "smug_eyes", "spread_arms", "sun_bear", "tap_dancing", "winking_at_viewer"], "stage3_selected_scores": {"bear": 0.5735, "primate": 0.8905, "loincloth": 0.5685, "monkey": 0.7558, "arms_above_head": 0.3975, "gorilla": 0.8299, "spread_arms": 0.4027, "winking_at_viewer": 0.4324, "pointing_at_viewer": 0.4203, "sloth_bear": 0.4453, "chimpanzee": 0.8275, "auburn_hair": 0.346, "sun_bear": 0.4335, "kermode_bear": 0.4397, "smug_eyes": 0.3465, "looking_at_viewer": 0.5475, "dancing": 0.5568, "raised_arms": 0.5445, "grinning_at_viewer": 0.442, "tap_dancing": 0.4339}, "stage3_selected_ranks": {"bear": 12, "primate": 2, "loincloth": 13, "monkey": 6, "arms_above_head": 81, "gorilla": 4, "spread_arms": 76, "winking_at_viewer": 53, "pointing_at_viewer": 63, "sloth_bear": 41, "chimpanzee": 5, "auburn_hair": 130, "sun_bear": 50, "kermode_bear": 44, "smug_eyes": 128, "looking_at_viewer": 18, "dancing": 16, "raised_arms": 19, "grinning_at_viewer": 42, "tap_dancing": 49}, "stage3_selected_phrase_ranks": {"bear": 1, "primate": 1, "loincloth": 1, "monkey": 6, "arms_above_head": 5, "gorilla": 2, "spread_arms": 5, "winking_at_viewer": 4, "pointing_at_viewer": 8, "sloth_bear": 6, "chimpanzee": 3, "auburn_hair": 7, "sun_bear": 10, "kermode_bear": 7, "smug_eyes": 5, "looking_at_viewer": 1, "dancing": 1, "raised_arms": 1, "grinning_at_viewer": 3, "tap_dancing": 2}, "extra_evidence": {"<3": {"source": "probe"}, "american_black_bear": {"source": "implied"}, "anthro": 
{"source": "structural"}, "arms_above_head": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3975}, "auburn_hair": {"source": "stage3", "why": "explicit", "retrieval_score": 0.346}, "black_bear": {"source": "implied"}, "bottomwear": {"source": "implied"}, "chimpanzee": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8275}, "duo": {"source": "probe"}, "feral": {"source": "structural"}, "gesture": {"source": "implied"}, "gorilla": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8299}, "grinning_at_viewer": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.442}, "kermode_bear": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4397}, "loincloth": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5685}, "monkey": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7558}, "one_eye_closed": {"source": "implied"}, "pan_(genus)": {"source": "implied"}, "pointing": {"source": "implied"}, "pointing_at_viewer": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4203}, "raised_arms": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.5445}, "sloth_bear": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4453}, "smug_eyes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3465}, "spread_arms": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4027}, "sun_bear": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4335}, "tap_dancing": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4339}, "topless": {"source": "structural"}, "trio": {"source": "structural"}, "ursine": {"source": "implied"}, "wink": {"source": "implied"}, "winking_at_viewer": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4324}}, "structural": ["trio", "anthro", "feral", "clothed", "topless", "looking_at_viewer"], "probe": ["anthro", "duo", "group", "bear", "simple_background", "<3"], "t1": 3.37, "t2": 4.07, "t3": 31.86, "t3s": 
5.95, "t3p": 4.45, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=161 entity=5 copyright_filtered=2 generic_char_to_general=1 unknown_type=3"]}
4
+ {"id": 1078019, "n_gt": 14, "n_retrieved": 133, "n_selected": 37, "n_implied": 12, "n_structural": 4, "n_probe": 5, "ret_R": 0.7143, "P": 0.3243, "R": 0.8571, "F1": 0.4706, "leaf_P": 0.2609, "leaf_R": 0.6667, "leaf_F1": 0.375, "n_leaf_sel": 23, "n_leaf_gt": 9, "ret_P": 0.0752, "sel_given_ret": 1.2, "over_sel": 2.64, "why": {"explicit": 19}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 3, "calls_with_selection": 3, "calls_exhausted_retries": 0, "attempts_total": 3, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 3, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 62, "attempts_by_n_local": {"60": {"attempts": 2, "parse_ok": 2, "parse_fail": 0, "errors": 0}, "11": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3243, "gen_R": 0.8571, "gen_F1": 0.4706, "missed": ["romantic", "romantic_couple"], "extra": ["<3", "blue_hair", "coat", "confident", "diaper", "domestic_rabbit", "dutch_rabbit", "expressions", "hair", "hand_holding", "holding_object", "holding_plushie", "holding_toy", "lab_coat", "looking_at_viewer", "oryctolagus", "relationship", "round_ears", "setting", "teddy_bear", "topwear", "touching_diaper", "toy", "vest", "winter_coat"], "ground_truth_tags": ["anthro", "blue_eyes", "blush", "clothed", "clothing", "duo", "lagomorph", "leporid", "mammal", "plushie", "rabbit", "romantic", "romantic_couple", "teal_eyes"], "selected_tags": ["<3", "anthro", "blue_eyes", "blue_hair", "blush", "clothed", "clothing", "coat", "confident", "diaper", "domestic_rabbit", "duo", "dutch_rabbit", "expressions", "hair", "hand_holding", "holding_object", "holding_plushie", "holding_toy", "lab_coat", "lagomorph", "leporid", "looking_at_viewer", "mammal", "oryctolagus", "plushie", "rabbit", 
"relationship", "round_ears", "setting", "teal_eyes", "teddy_bear", "topwear", "touching_diaper", "toy", "vest", "winter_coat"], "stage3_selected": ["blue_eyes", "blue_hair", "coat", "confident", "duo", "dutch_rabbit", "expressions", "hand_holding", "holding_plushie", "holding_toy", "lab_coat", "relationship", "round_ears", "setting", "teal_eyes", "teddy_bear", "touching_diaper", "vest", "winter_coat"], "stage3_selected_scores": {"duo": 0.3628, "blue_eyes": 0.6151, "blue_hair": 0.4202, "hand_holding": 0.4283, "coat": 0.6383, "vest": 0.5028, "teal_eyes": 0.6283, "lab_coat": 0.516, "round_ears": 0.4343, "teddy_bear": 0.5459, "confident": 0.5161, "expressions": 0.5454, "touching_diaper": 0.4638, "holding_plushie": 0.7793, "winter_coat": 0.4759, "dutch_rabbit": 0.4583, "holding_toy": 0.5855, "relationship": 0.6206, "setting": 0.5567}, "stage3_selected_ranks": {"duo": 131, "blue_eyes": 12, "blue_hair": 118, "hand_holding": 112, "coat": 7, "vest": 47, "teal_eyes": 8, "lab_coat": 40, "round_ears": 102, "teddy_bear": 22, "confident": 39, "expressions": 23, "touching_diaper": 75, "holding_plushie": 2, "winter_coat": 66, "dutch_rabbit": 80, "holding_toy": 14, "relationship": 9, "setting": 20}, "stage3_selected_phrase_ranks": {"duo": 3, "blue_eyes": 1, "blue_hair": 8, "hand_holding": 9, "coat": 1, "vest": 6, "teal_eyes": 1, "lab_coat": 5, "round_ears": 10, "teddy_bear": 5, "confident": 7, "expressions": 2, "touching_diaper": 6, "holding_plushie": 1, "winter_coat": 10, "dutch_rabbit": 4, "holding_toy": 4, "relationship": 1, "setting": 1}, "extra_evidence": {"<3": {"source": "probe"}, "blue_hair": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4202}, "coat": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6383}, "confident": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5161}, "diaper": {"source": "implied"}, "domestic_rabbit": {"source": "implied"}, "dutch_rabbit": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4583}, 
"expressions": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5454}, "hair": {"source": "implied"}, "hand_holding": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4283}, "holding_object": {"source": "implied"}, "holding_plushie": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7793}, "holding_toy": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5855}, "lab_coat": {"source": "stage3", "why": "explicit", "retrieval_score": 0.516}, "looking_at_viewer": {"source": "structural"}, "oryctolagus": {"source": "implied"}, "relationship": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6206}, "round_ears": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4343}, "setting": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5567}, "teddy_bear": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5459}, "topwear": {"source": "implied"}, "touching_diaper": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4638}, "toy": {"source": "implied"}, "vest": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5028}, "winter_coat": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4759}}, "structural": ["duo", "anthro", "clothed", "looking_at_viewer"], "probe": ["clothing", "anthro", "blush", "duo", "<3"], "t1": 2.39, "t2": 3.53, "t3": 32.53, "t3s": 5.33, "t3p": 6.83, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=131 entity=2 copyright_filtered=1 generic_char_to_general=0 unknown_type=2"]}
5
+ {"id": 1624724, "n_gt": 4, "n_retrieved": 124, "n_selected": 30, "n_implied": 5, "n_structural": 5, "n_probe": 3, "ret_R": 0.75, "P": 0.1333, "R": 1.0, "F1": 0.2353, "leaf_P": 0.125, "leaf_R": 0.75, "leaf_F1": 0.2143, "n_leaf_sel": 24, "n_leaf_gt": 4, "ret_P": 0.0242, "sel_given_ret": 1.3333, "over_sel": 7.5, "why": {"explicit": 20}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 3, "calls_with_selection": 3, "calls_exhausted_retries": 0, "attempts_total": 3, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 3, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 55, "attempts_by_n_local": {"60": {"attempts": 2, "parse_ok": 2, "parse_fail": 0, "errors": 0}, "5": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.1333, "gen_R": 1.0, "gen_F1": 0.2353, "missed": [], "extra": ["ambiguous_gender", "anthro", "belly", "covering", "covering_mouth", "feral", "floating", "grinning_at_viewer", "looking_at_viewer", "nose", "nude", "red_spots", "round_eyes", "smiling_at_viewer", "smirk", "smirking_at_viewer", "spots", "spotted_legs", "tan_belly", "tan_chest", "tan_face", "tan_head", "tan_stripes", "toony", "toony_eyes", "wide_eyed"], "ground_truth_tags": ["red_nose", "smile", "solo", "tan_body"], "selected_tags": ["ambiguous_gender", "anthro", "belly", "covering", "covering_mouth", "feral", "floating", "grinning_at_viewer", "looking_at_viewer", "nose", "nude", "red_nose", "red_spots", "round_eyes", "smile", "smiling_at_viewer", "smirk", "smirking_at_viewer", "solo", "spots", "spotted_legs", "tan_belly", "tan_body", "tan_chest", "tan_face", "tan_head", "tan_stripes", "toony", "toony_eyes", "wide_eyed"], "stage3_selected": ["covering_mouth", "floating", "grinning_at_viewer", "nose", "red_nose", 
"red_spots", "round_eyes", "smile", "smirking_at_viewer", "spotted_legs", "tan_belly", "tan_body", "tan_chest", "tan_face", "tan_head", "tan_stripes", "toony", "toony_eyes", "white_background", "wide_eyed"], "stage3_selected_scores": {"smile": 0.6098, "white_background": 0.6267, "tan_body": 0.6777, "toony": 0.638, "wide_eyed": 0.4762, "red_nose": 0.7461, "floating": 0.6778, "covering_mouth": 0.3954, "tan_belly": 0.6631, "tan_face": 0.7162, "spotted_legs": 0.6719, "tan_chest": 0.7032, "smirking_at_viewer": 0.4548, "red_spots": 0.6307, "round_eyes": 0.8856, "grinning_at_viewer": 0.4958, "tan_head": 0.6682, "tan_stripes": 0.6821, "toony_eyes": 0.3921, "nose": 0.8851}, "stage3_selected_ranks": {"smile": 45, "white_background": 40, "tan_body": 19, "toony": 35, "wide_eyed": 89, "red_nose": 4, "floating": 18, "covering_mouth": 120, "tan_belly": 27, "tan_face": 8, "spotted_legs": 21, "tan_chest": 11, "smirking_at_viewer": 99, "red_spots": 39, "round_eyes": 2, "grinning_at_viewer": 83, "tan_head": 23, "tan_stripes": 15, "toony_eyes": 121, "nose": 3}, "stage3_selected_phrase_ranks": {"smile": 2, "white_background": 1, "tan_body": 6, "toony": 1, "wide_eyed": 8, "red_nose": 1, "floating": 1, "covering_mouth": 10, "tan_belly": 10, "tan_face": 1, "spotted_legs": 5, "tan_chest": 4, "smirking_at_viewer": 8, "red_spots": 10, "round_eyes": 1, "grinning_at_viewer": 6, "tan_head": 8, "tan_stripes": 5, "toony_eyes": 7, "nose": 1}, "extra_evidence": {"ambiguous_gender": {"source": "structural"}, "anthro": {"source": "probe"}, "belly": {"source": "implied"}, "covering": {"source": "implied"}, "covering_mouth": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3954}, "feral": {"source": "structural"}, "floating": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6778}, "grinning_at_viewer": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4958}, "looking_at_viewer": {"source": "structural"}, "nose": {"source": "stage3", "why": "explicit", 
"retrieval_score": 0.8851}, "nude": {"source": "structural"}, "red_spots": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6307}, "round_eyes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8856}, "smiling_at_viewer": {"source": "implied"}, "smirk": {"source": "implied"}, "smirking_at_viewer": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4548}, "spots": {"source": "implied"}, "spotted_legs": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6719}, "tan_belly": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6631}, "tan_chest": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7032}, "tan_face": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7162}, "tan_head": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6682}, "tan_stripes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6821}, "toony": {"source": "stage3", "why": "explicit", "retrieval_score": 0.638}, "toony_eyes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3921}, "wide_eyed": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4762}}, "structural": ["solo", "feral", "ambiguous_gender", "nude", "looking_at_viewer"], "probe": ["simple_background", "anthro", "solo"], "t1": 4.89, "t2": 1.55, "t3": 21.08, "t3s": 3.65, "t3p": 8.37, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=125 entity=0 copyright_filtered=4 generic_char_to_general=0 unknown_type=5"]}
6
+ {"id": 1325009, "n_gt": 22, "n_retrieved": 10, "n_selected": 11, "n_implied": 3, "n_structural": 5, "n_probe": 3, "ret_R": 0.0, "P": 0.6364, "R": 0.3182, "F1": 0.4242, "leaf_P": 0.125, "leaf_R": 0.0833, "leaf_F1": 0.1, "n_leaf_sel": 8, "n_leaf_gt": 12, "ret_P": 0.0, "sel_given_ret": 0.0, "over_sel": 0.5, "why": {"explicit": 2}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 2, "attempts_by_n_local": {"7": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.6364, "gen_R": 0.3182, "gen_F1": 0.4242, "missed": ["blue_eyes", "bottomwear", "chest_tuft", "countershading", "fur", "hand_on_head", "muscular", "muscular_anthro", "muscular_male", "pantherine", "shorts", "stripes", "tiger", "topless", "tuft"], "extra": ["comic", "doujinshi", "humor", "looking_at_viewer"], "ground_truth_tags": ["anthro", "blue_eyes", "bottomwear", "chest_tuft", "clothed", "clothing", "countershading", "felid", "fur", "hand_on_head", "male", "mammal", "muscular", "muscular_anthro", "muscular_male", "pantherine", "shorts", "solo", "stripes", "tiger", "topless", "tuft"], "selected_tags": ["anthro", "clothed", "clothing", "comic", "doujinshi", "felid", "humor", "looking_at_viewer", "male", "mammal", "solo"], "stage3_selected": ["doujinshi", "humor"], "stage3_selected_scores": {"humor": 0.442, "doujinshi": 0.3981}, "stage3_selected_ranks": {"humor": 3, "doujinshi": 5}, "stage3_selected_phrase_ranks": {"humor": 3, "doujinshi": 5}, "extra_evidence": {"comic": {"source": "implied"}, "doujinshi": {"source": "stage3", "why": "explicit", 
"retrieval_score": 0.3981}, "humor": {"source": "stage3", "why": "explicit", "retrieval_score": 0.442}, "looking_at_viewer": {"source": "structural"}}, "structural": ["solo", "anthro", "male", "clothed", "looking_at_viewer"], "probe": ["anthro", "felid", "solo"], "t1": 7.53, "t2": 0.08, "t3": 1.75, "t3s": 0.99, "t3p": 1.53, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=7 entity=0 copyright_filtered=3 generic_char_to_general=0 unknown_type=0"]}
7
+ {"id": 1023509, "n_gt": 13, "n_retrieved": 174, "n_selected": 8, "n_implied": 0, "n_structural": 3, "n_probe": 4, "ret_R": 0.6923, "P": 0.125, "R": 0.0769, "F1": 0.0952, "leaf_P": 0.125, "leaf_R": 0.1667, "leaf_F1": 0.1429, "n_leaf_sel": 8, "n_leaf_gt": 6, "ret_P": 0.0517, "sel_given_ret": 0.1111, "over_sel": 0.62, "why": {"explicit": 3}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 3, "calls_with_selection": 2, "calls_exhausted_retries": 1, "attempts_total": 6, "attempt_errors": 4, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 53, "attempts_by_n_local": {"60": {"attempts": 4, "parse_ok": 1, "parse_fail": 0, "errors": 3}, "48": {"attempts": 2, "parse_ok": 1, "parse_fail": 0, "errors": 1}}, "attempt_failure_rate": 0.6666666666666666, "call_exhaustion_rate": 0.3333333333333333}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.125, "gen_R": 0.0769, "gen_F1": 0.0952, "missed": ["bovid", "caprine", "dialogue", "fur", "goat", "human", "lizard", "mammal", "reptile", "scalie", "white_body", "white_fur"], "extra": ["anthro", "clothing", "darkness", "group", "light", "solo", "threatening"], "ground_truth_tags": ["bovid", "caprine", "dialogue", "fur", "goat", "human", "lizard", "mammal", "reptile", "scalie", "text", "white_body", "white_fur"], "selected_tags": ["anthro", "clothing", "darkness", "group", "light", "solo", "text", "threatening"], "stage3_selected": ["darkness", "light", "threatening"], "stage3_selected_scores": {"light": 0.7785, "threatening": 0.5582, "darkness": 0.8348}, "stage3_selected_ranks": {"light": 4, "threatening": 72, "darkness": 2}, "stage3_selected_phrase_ranks": {"light": 1, "threatening": 4, "darkness": 1}, "extra_evidence": {"anthro": {"source": "probe"}, "clothing": {"source": "probe"}, "darkness": {"source": "stage3", "why": "explicit", 
"retrieval_score": 0.8348}, "group": {"source": "structural"}, "light": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7785}, "solo": {"source": "structural"}, "threatening": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5582}}, "structural": ["solo", "group", "text"], "probe": ["clothing", "anthro", "text", "group"], "t1": 4.86, "t2": 1.62, "t3": 41.08, "t3s": 1.97, "t3p": 2.29, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=168 entity=4 copyright_filtered=2 generic_char_to_general=0 unknown_type=2", "Stage3 general_chunk_1: attempt 1 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"other\"}, {\"i\": 2, \"why\": \"weak_implied\"}, {\"i\": 3, \"why\": \"other\"}, {\"i\": 4, \"why\": \"explicit\"}, {\"i\": 5, \"why\": \"strong_implied\"}, {\"i\": 6, \"why\": \"style_or_meta\"}, {\"i\": 7, \"why\": \"weak_implied\"}, {\"i\": 8, \"why\": \"style_or_meta\"}, {\"i\": 9, \"why\": \"weak_implied\"}, {\"i\": 10, \"why\": \"weak_implied\"}, {\"i\": 11, \"why\": \"weak_implied\"}, {\"i\": 12, \"why\": \"weak_implied\"}, {\"i\": 13, \"why\": \"weak_implied\"}, {\"i\": 14, \"why\": \"style_or_meta\"}, {\"i\": 15, \"why\": \"weak_implied\"}, {\"i\": 16, \"why\": \"weak_implied\"}, {\"i\": 17, \"why\": \"weak_implied\"}, {\"i\": 18, \"why\": \"weak_implied\"}, {\"i\": 19, \"why\": \"style_or_meta\"}, {\"i\": 20, \"why\": \"weak_implied\"}, {\"i\": 21, \"why\": \"other\"}, {\"i\": 22, \"why\": \"other\"}, {\"i\": 23, \"why\": \"weak_implied\"}, {\"i\": 24, \"why\": \"weak_implied\"}, {\"i\": 25, \"why\": \"strong_implied\"}, {\"i\": 26, \"why\": \"weak_implied\"}, {\"i\": 27, \"why\": \"weak_implied\"}, {\"i\": 28, \"why\": \"weak_implied\"}, {\"i\": 29, \"why\": \"weak_implied\"}, {\"i\": 30, \"why\": \"weak_implied\"}, {\"i\": 31, \"why\": \"weak_implied\"}, {\"i\": 32, \"why\": \"weak_implied\"}, 
{\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 34, \"why\": \"style_or_meta\"}, {}]}. Got: 2 validation errors for Stage3SelectionResponse\nselections.34.i\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nselections.34.why\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_1: attempt 2 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"other\"}, {\"i\": 2, \"why\": \"weak_implied\"}, {\"i\": 3, \"why\": \"other\"}, {\"i\": 4, \"why\": \"explicit\"}, {\"i\": 5, \"why\": \"strong_implied\"}, {\"i\": 6, \"why\": \"style_or_meta\"}, {\"i\": 7, \"why\": \"weak_implied\"}, {\"i\": 8, \"why\": \"style_or_meta\"}, {\"i\": 9, \"why\": \"weak_implied\"}, {\"i\": 10, \"why\": \"weak_implied\"}, {\"i\": 11, \"why\": \"weak_implied\"}, {\"i\": 12, \"why\": \"explicit\"}, {\"i\": 13, \"why\": \"weak_implied\"}, {\"i\": 14, \"why\": \"style_or_meta\"}, {\"i\": 15, \"why\": \"weak_implied\"}, {\"i\": 16, \"why\": \"weak_implied\"}, {\"i\": 17, \"why\": \"weak_implied\"}, {\"i\": 18, \"why\": \"weak_implied\"}, {\"i\": 19, \"why\": \"style_or_meta\"}, {\"i\": 20, \"why\": \"weak_implied\"}, {\"i\": 21, \"why\": \"other\"}, {\"i\": 22, \"why\": \"other\"}, {\"i\": 23, \"why\": \"weak_implied\"}, {\"i\": 24, \"why\": \"weak_implied\"}, {\"i\": 25, \"why\": \"strong_implied\"}, {\"i\": 26, \"why\": \"weak_implied\"}, {\"i\": 27, \"why\": \"weak_implied\"}, {\"i\": 28, \"why\": \"weak_implied\"}, {\"i\": 29, \"why\": \"weak_implied\"}, {\"i\": 30, \"why\": \"weak_implied\"}, {\"i\": 31, \"why\": \"weak_implied\"}, {\"i\": 32, \"why\": \"weak_implied\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 34, \"why\": 
\"style_or_meta\"}, {\"i\": 35}]}. Got: 1 validation error for Stage3SelectionResponse\nselections.34.why\n Field required [type=missing, input_value={'i': 35}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_1: attempt 3 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 2, \"why\": \"weak_implied\"}, {\"i\": 3, \"why\": \"other\"}, {\"i\": 4, \"why\": \"explicit\"}, {\"i\": 5, \"why\": \"strong_implied\"}, {\"i\": 7, \"why\": \"weak_implied\"}, {\"i\": 8, \"why\": \"style_or_meta\"}, {\"i\": 10, \"why\": \"weak_implied\"}, {\"i\": 11, \"why\": \"weak_implied\"}, {\"i\": 12, \"why\": \"weak_implied\"}, {\"i\": 13, \"why\": \"weak_implied\"}, {\"i\": 14, \"why\": \"style_or_meta\"}, {\"i\": 16, \"why\": \"weak_implied\"}, {\"i\": 17, \"why\": \"weak_implied\"}, {\"i\": 20, \"why\": \"weak_implied\"}, {\"i\": 21, \"why\": \"other\"}, {\"i\": 23, \"why\": \"weak_implied\"}, {\"i\": 24, \"why\": \"weak_implied\"}, {\"i\": 25, \"why\": \"strong_implied\"}, {\"i\": 27, \"why\": \"weak_implied\"}, {\"i\": 28, \"why\": \"weak_implied\"}, {\"i\": 30, \"why\": \"weak_implied\"}, {\"i\": 31, \"why\": \"weak_implied\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 34, \"why\": \"style_or_meta\"}, {\"i\": 35, \"why\": \"weak_implied\"}, {\"i\": 37, \"why\": \"weak_implied\"}, {\"i\": 39, \"why\": \"other\"}, {\"i\": 40, \"why\": \"other\"}, {\"i\": 42, \"why\": \"other\"}, {\"i\": 43, \"why\": \"weak_implied\"}, {\"i\": 45, \"why\": \"weak_implied\"}, {\"i\": 46, \"why\": \"weak_implied\"}, {\"i\": 47, \"why\": \"weak_implied\"}, {\"i\": 48, \"why\": \"other\"}, {\"i\": 49}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.34.why\n Field required [type=missing, input_value={'i': 49}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_1: gave up after 3 attempts", "Stage3 general_chunk_2: attempt 1 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"weak_implied\"}, {\"i\": 2, \"why\": \"weak_implied\"}, {\"i\": 3, \"why\": \"weak_implied\"}, {\"i\": 4, \"why\": \"style_or_meta\"}, {\"i\": 5, \"why\": \"weak_implied\"}, {\"i\": 6, \"why\": \"weak_implied\"}, {\"i\": 7, \"why\": \"weak_implied\"}, {\"i\": 8, \"why\": \"weak_implied\"}, {\"i\": 9, \"why\": \"weak_implied\"}, {\"i\": 10, \"why\": \"weak_implied\"}, {\"i\": 11, \"why\": \"weak_implied\"}, {\"i\": 12, \"why\": \"weak_implied\"}, {\"i\": 13, \"why\": \"weak_implied\"}, {\"i\": 14, \"why\": \"weak_implied\"}, {\"i\": 15, \"why\": \"weak_implied\"}, {\"i\": 16, \"why\": \"weak_implied\"}, {\"i\": 17, \"why\": \"weak_implied\"}, {\"i\": 18, \"why\": \"weak_implied\"}, {\"i\": 19, \"why\": \"weak_implied\"}, {\"i\": 20, \"why\": \"weak_implied\"}, {\"i\": 21, \"why\": \"weak_implied\"}, {\"i\": 22, \"why\": \"weak_implied\"}, {\"i\": 23, \"why\": \"weak_implied\"}, {\"i\": 24, \"why\": \"weak_implied\"}, {\"i\": 25, \"why\": \"weak_implied\"}, {\"i\": 26, \"why\": \"weak_implied\"}, {\"i\": 27, \"why\": \"weak_implied\"}, {\"i\": 28, \"why\": \"weak_implied\"}, {\"i\": 29, \"why\": \"weak_implied\"}, {\"i\": 30, \"why\": \"weak_implied\"}, {\"i\": 31, \"why\": \"weak_implied\"}, {\"i\": 32, \"why\": \"weak_implied\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 34}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.33.why\n Field required [type=missing, input_value={'i': 34}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE "]}
8
+ {"id": 335343, "n_gt": 15, "n_retrieved": 208, "n_selected": 49, "n_implied": 8, "n_structural": 3, "n_probe": 3, "ret_R": 0.6667, "P": 0.2245, "R": 0.7333, "F1": 0.3438, "leaf_P": 0.1667, "leaf_R": 0.5, "leaf_F1": 0.25, "n_leaf_sel": 36, "n_leaf_gt": 12, "ret_P": 0.0481, "sel_given_ret": 1.1, "over_sel": 3.27, "why": {"explicit": 38}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 4, "calls_with_selection": 4, "calls_exhausted_retries": 0, "attempts_total": 4, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 4, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 91, "attempts_by_n_local": {"60": {"attempts": 3, "parse_ok": 3, "parse_fail": 0, "errors": 0}, "31": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.2245, "gen_R": 0.7333, "gen_F1": 0.3438, "missed": ["angry", "eyes_closed", "eyeshadow", "sleeping"], "extra": ["3rd_party_watermark", "annoyed_expression", "anthro", "applying_makeup", "artist_logo", "bed_sheet", "bedding", "bedroom", "blanket", "clothing", "comic", "english_text", "expressions", "eyebrows", "eyeliner", "eyes", "green_eyebrows", "half-closed_eyes", "head_on_pillow", "humanoid", "letters", "logo", "looking_down_at_another", "lying_on_bed", "mascara", "narrowed_eyes", "on_bed", "pajamas", "personal_grooming", "purple_theme", "relaxed_expression", "resting", "romantic", "sleepover", "text_box", "under_blanket", "vase", "watermark"], "ground_truth_tags": ["angry", "bed", "blonde_hair", "blue_eyes", "duo", "eyes_closed", "eyeshadow", "furniture", "green_eyes", "hair", "lying", "makeup", "purple_hair", "sleeping", "text"], "selected_tags": ["3rd_party_watermark", "annoyed_expression", "anthro", "applying_makeup", "artist_logo", "bed", "bed_sheet", "bedding", 
"bedroom", "blanket", "blonde_hair", "blue_eyes", "clothing", "comic", "duo", "english_text", "expressions", "eyebrows", "eyeliner", "eyes", "furniture", "green_eyebrows", "green_eyes", "hair", "half-closed_eyes", "head_on_pillow", "humanoid", "letters", "logo", "looking_down_at_another", "lying", "lying_on_bed", "makeup", "mascara", "narrowed_eyes", "on_bed", "pajamas", "personal_grooming", "purple_hair", "purple_theme", "relaxed_expression", "resting", "romantic", "sleepover", "text", "text_box", "under_blanket", "vase", "watermark"], "stage3_selected": ["3rd_party_watermark", "annoyed_expression", "applying_makeup", "artist_logo", "bed_sheet", "bedding", "bedroom", "blanket", "blonde_hair", "blue_eyes", "comic", "english_text", "expressions", "eyeliner", "eyes", "green_eyebrows", "green_eyes", "hair", "half-closed_eyes", "head_on_pillow", "letters", "looking_down_at_another", "lying", "lying_on_bed", "makeup", "mascara", "pajamas", "purple_hair", "purple_theme", "relaxed_expression", "resting", "romantic", "sleepover", "text", "text_box", "under_blanket", "vase", "watermark"], "stage3_selected_scores": {"hair": 0.6031, "text": 0.6007, "blue_eyes": 0.6014, "lying": 0.4409, "green_eyes": 0.5989, "comic": 0.3867, "blonde_hair": 0.5986, "half-closed_eyes": 0.3951, "purple_hair": 0.5642, "makeup": 0.5965, "watermark": 0.6042, "bedroom": 0.4901, "romantic": 0.3813, "bedding": 0.3909, "bed_sheet": 0.3993, "blanket": 0.4205, "mascara": 0.4462, "eyeliner": 0.4454, "lying_on_bed": 0.4093, "artist_logo": 0.3933, "text_box": 0.3916, "pajamas": 0.4086, "purple_theme": 0.4555, "vase": 0.3521, "resting": 0.5034, "annoyed_expression": 0.7251, "expressions": 0.5439, "head_on_pillow": 0.3887, "green_eyebrows": 0.5014, "looking_down_at_another": 0.4491, "3rd_party_watermark": 0.398, "sleepover": 0.5269, "under_blanket": 0.4281, "letters": 0.3656, "applying_makeup": 0.473, "relaxed_expression": 0.5056, "eyes": 0.8951, "english_text": 0.4189}, "stage3_selected_ranks": {"hair": 5, 
"text": 8, "blue_eyes": 7, "lying": 79, "green_eyes": 9, "comic": 152, "blonde_hair": 10, "half-closed_eyes": 140, "purple_hair": 14, "makeup": 11, "watermark": 4, "bedroom": 43, "romantic": 157, "bedding": 149, "bed_sheet": 134, "blanket": 98, "mascara": 74, "eyeliner": 76, "lying_on_bed": 116, "artist_logo": 143, "text_box": 146, "pajamas": 120, "purple_theme": 63, "vase": 187, "resting": 33, "annoyed_expression": 2, "expressions": 18, "head_on_pillow": 151, "green_eyebrows": 34, "looking_down_at_another": 69, "3rd_party_watermark": 135, "sleepover": 26, "under_blanket": 91, "letters": 176, "applying_makeup": 55, "relaxed_expression": 32, "eyes": 1, "english_text": 103}, "stage3_selected_phrase_ranks": {"hair": 1, "text": 1, "blue_eyes": 1, "lying": 7, "green_eyes": 1, "comic": 9, "blonde_hair": 1, "half-closed_eyes": 10, "purple_hair": 1, "makeup": 1, "watermark": 1, "bedroom": 1, "romantic": 2, "bedding": 7, "bed_sheet": 5, "blanket": 7, "mascara": 9, "eyeliner": 10, "lying_on_bed": 4, "artist_logo": 7, "text_box": 8, "pajamas": 3, "purple_theme": 10, "vase": 8, "resting": 1, "annoyed_expression": 1, "expressions": 3, "head_on_pillow": 8, "green_eyebrows": 2, "looking_down_at_another": 5, "3rd_party_watermark": 3, "sleepover": 1, "under_blanket": 8, "letters": 9, "applying_makeup": 4, "relaxed_expression": 6, "eyes": 1, "english_text": 4}, "extra_evidence": {"3rd_party_watermark": {"source": "stage3", "why": "explicit", "retrieval_score": 0.398}, "annoyed_expression": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7251}, "anthro": {"source": "probe"}, "applying_makeup": {"source": "stage3", "why": "explicit", "retrieval_score": 0.473}, "artist_logo": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3933}, "bed_sheet": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3993}, "bedding": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3909}, "bedroom": {"source": "stage3", "why": "explicit", "retrieval_score": 
0.4901}, "blanket": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4205}, "clothing": {"source": "implied"}, "comic": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3867}, "english_text": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4189}, "expressions": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5439}, "eyebrows": {"source": "implied"}, "eyeliner": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4454}, "eyes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8951}, "green_eyebrows": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5014}, "half-closed_eyes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3951}, "head_on_pillow": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3887}, "humanoid": {"source": "structural"}, "letters": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3656}, "logo": {"source": "implied"}, "looking_down_at_another": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4491}, "lying_on_bed": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4093}, "mascara": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4462}, "narrowed_eyes": {"source": "implied"}, "on_bed": {"source": "implied"}, "pajamas": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4086}, "personal_grooming": {"source": "implied"}, "purple_theme": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4555}, "relaxed_expression": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5056}, "resting": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5034}, "romantic": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3813}, "sleepover": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5269}, "text_box": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3916}, "under_blanket": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4281}, 
"vase": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3521}, "watermark": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6042}}, "structural": ["duo", "humanoid", "text"], "probe": ["simple_background", "anthro", "duo"], "t1": 3.8, "t2": 1.97, "t3": 26.17, "t3s": 0.65, "t3p": 5.66, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=211 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=4"]}
9
+ {"id": 17482, "n_gt": 22, "n_retrieved": 185, "n_selected": 45, "n_implied": 17, "n_structural": 3, "n_probe": 3, "ret_R": 0.5, "P": 0.3556, "R": 0.7273, "F1": 0.4776, "leaf_P": 0.1818, "leaf_R": 0.3077, "leaf_F1": 0.2286, "n_leaf_sel": 22, "n_leaf_gt": 13, "ret_P": 0.0595, "sel_given_ret": 1.4545, "over_sel": 2.05, "why": {"explicit": 24}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 4, "calls_with_selection": 4, "calls_exhausted_retries": 0, "attempts_total": 4, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 4, "invalid_items_total": 0, "oob_indices_total": 1, "dupe_indices_total": 0, "kept_total": 70, "attempts_by_n_local": {"60": {"attempts": 3, "parse_ok": 3, "parse_fail": 0, "errors": 0}, "5": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3556, "gen_R": 0.7273, "gen_F1": 0.4776, "missed": ["canine", "fur", "holding_musical_instrument", "holding_object", "music", "spade_tail"], "extra": ["3_claws", "3_fingers", "4_claws", "acoustic_guitar", "blonde_hair", "bottomwear", "curled_hair", "denim", "denim_clothing", "flowing_hair", "hand_gesture", "jeans", "long_tail", "pants", "pastel_background", "percussion_instrument", "playing_guitar", "playing_music", "poster_(object)", "shirt", "shorts", "toe_claws", "topwear", "torn_bottomwear", "torn_jeans", "torn_pants", "torn_shirt", "torn_shorts", "torn_topwear"], "ground_truth_tags": ["anthro", "bass_guitar", "canid", "canine", "claws", "clothed", "clothing", "fingers", "fur", "guitar", "hair", "holding_musical_instrument", "holding_object", "mammal", "music", "musical_instrument", "plucked_string_instrument", "solo", "spade_tail", "string_instrument", "tail", "torn_clothing"], "selected_tags": ["3_claws", "3_fingers", "4_claws", "acoustic_guitar", "anthro", 
"bass_guitar", "blonde_hair", "bottomwear", "canid", "claws", "clothed", "clothing", "curled_hair", "denim", "denim_clothing", "fingers", "flowing_hair", "guitar", "hair", "hand_gesture", "jeans", "long_tail", "mammal", "musical_instrument", "pants", "pastel_background", "percussion_instrument", "playing_guitar", "playing_music", "plucked_string_instrument", "poster_(object)", "shirt", "shorts", "solo", "string_instrument", "tail", "toe_claws", "topwear", "torn_bottomwear", "torn_clothing", "torn_jeans", "torn_pants", "torn_shirt", "torn_shorts", "torn_topwear"], "stage3_selected": ["3_claws", "3_fingers", "4_claws", "acoustic_guitar", "bass_guitar", "blonde_hair", "curled_hair", "flowing_hair", "hand_gesture", "long_tail", "pastel_background", "percussion_instrument", "playing_guitar", "plucked_string_instrument", "poster_(object)", "string_instrument", "tail", "toe_claws", "torn_bottomwear", "torn_clothing", "torn_jeans", "torn_shirt", "torn_shorts", "torn_topwear"], "stage3_selected_scores": {"tail": 0.5659, "blonde_hair": 0.382, "toe_claws": 0.4913, "torn_clothing": 0.4132, "long_tail": 0.4222, "3_fingers": 0.4085, "string_instrument": 0.8617, "torn_bottomwear": 0.4362, "plucked_string_instrument": 0.8658, "curled_hair": 0.3875, "torn_topwear": 0.3945, "torn_shirt": 0.4049, "playing_guitar": 0.9317, "torn_jeans": 0.4824, "percussion_instrument": 0.8503, "3_claws": 0.4377, "hand_gesture": 0.4013, "torn_shorts": 0.3996, "bass_guitar": 0.9118, "flowing_hair": 0.5669, "4_claws": 0.4516, "poster_(object)": 0.4455, "acoustic_guitar": 0.8654, "pastel_background": 0.5632}, "stage3_selected_ranks": {"tail": 21, "blonde_hair": 147, "toe_claws": 38, "torn_clothing": 112, "long_tail": 104, "3_fingers": 117, "string_instrument": 8, "torn_bottomwear": 85, "plucked_string_instrument": 6, "curled_hair": 142, "torn_topwear": 131, "torn_shirt": 119, "playing_guitar": 2, "torn_jeans": 44, "percussion_instrument": 9, "3_claws": 83, "hand_gesture": 121, "torn_shorts": 126, 
"bass_guitar": 3, "flowing_hair": 20, "4_claws": 70, "poster_(object)": 76, "acoustic_guitar": 7, "pastel_background": 22}, "stage3_selected_phrase_ranks": {"tail": 1, "blonde_hair": 6, "toe_claws": 2, "torn_clothing": 6, "long_tail": 5, "3_fingers": 8, "string_instrument": 7, "torn_bottomwear": 3, "plucked_string_instrument": 5, "curled_hair": 5, "torn_topwear": 10, "torn_shirt": 7, "playing_guitar": 1, "torn_jeans": 1, "percussion_instrument": 7, "3_claws": 7, "hand_gesture": 9, "torn_shorts": 8, "bass_guitar": 2, "flowing_hair": 1, "4_claws": 4, "poster_(object)": 1, "acoustic_guitar": 5, "pastel_background": 1}, "extra_evidence": {"3_claws": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4377}, "3_fingers": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4085}, "4_claws": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4516}, "acoustic_guitar": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8654}, "blonde_hair": {"source": "stage3", "why": "explicit", "retrieval_score": 0.382}, "bottomwear": {"source": "implied"}, "curled_hair": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3875}, "denim": {"source": "implied"}, "denim_clothing": {"source": "implied"}, "flowing_hair": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5669}, "hand_gesture": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4013}, "jeans": {"source": "implied"}, "long_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4222}, "pants": {"source": "implied"}, "pastel_background": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5632}, "percussion_instrument": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8503}, "playing_guitar": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9317}, "playing_music": {"source": "implied"}, "poster_(object)": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4455}, "shirt": {"source": "implied"}, "shorts": 
{"source": "implied"}, "toe_claws": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4913}, "topwear": {"source": "implied"}, "torn_bottomwear": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4362}, "torn_jeans": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4824}, "torn_pants": {"source": "implied"}, "torn_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4049}, "torn_shorts": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3996}, "torn_topwear": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3945}}, "structural": ["solo", "anthro", "clothed"], "probe": ["anthro", "canid", "solo"], "t1": 2.75, "t2": 1.86, "t3": 26.63, "t3s": 0.95, "t3p": 1.28, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=185 entity=2 copyright_filtered=4 generic_char_to_general=0 unknown_type=2", "Stage3 general_chunk_2: candidates (local indices):\n1. auburn hair\n2. pointing\n3. red sclera\n4. studded bracelet\n5. tail ring\n6. torn clothing\n7. ethiopian wolf\n8. inspired by formal art\n9. fire\n10. star-shaped background\n11. 3 claws\n12. wanted poster\n13. bulldog\n14. crosslegged pose\n15. angry expression\n16. 3 fingers\n17. big hair\n18. string instrument\n19. hair tie\n20. head horn\n21. ring\n22. holding tail\n23. torn shirt\n24. coywolf\n25. attack\n26. pun\n27. starry background\n28. long claws\n29. lined paper\n30. rottweiler\n31. lotus pose\n32. annoyed expression\n33. fingerpads\n34. hairclip\n35. percussion instrument\n36. hair dye\n37. demon humanoid\n38. drawstring\n39. big tail\n40. torn shorts\n41. coyote\n42. escape\n43. business attire\n44. sky background\n45. big claws\n46. warning sign\n47. fighting pose\n48. hand on chin\n49. wavy hair\n50. musical instrument\n51. blue hair\n52. short horn\n53. ear piercing\n54. scaly tail\n55. torn body\n56. abyssal wolf\n57. burning\n58. 
transparent background\n59. digitigrade\n60. contrapposto"]}
10
+ {"id": 2021552, "n_gt": 25, "n_retrieved": 151, "n_selected": 28, "n_implied": 14, "n_structural": 4, "n_probe": 3, "ret_R": 0.6, "P": 0.5714, "R": 0.64, "F1": 0.6038, "leaf_P": 0.3846, "leaf_R": 0.3333, "leaf_F1": 0.3571, "n_leaf_sel": 13, "n_leaf_gt": 15, "ret_P": 0.0993, "sel_given_ret": 1.0667, "over_sel": 1.12, "why": {"explicit": 11}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 3, "calls_with_selection": 3, "calls_exhausted_retries": 0, "attempts_total": 4, "attempt_errors": 1, "attempt_parse_fail": 0, "attempt_parse_ok": 3, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 89, "attempts_by_n_local": {"60": {"attempts": 3, "parse_ok": 2, "parse_fail": 0, "errors": 1}, "31": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.25, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.5714, "gen_R": 0.64, "gen_F1": 0.6038, "missed": ["claws", "crossed_arms", "facial_markings", "fur", "head_markings", "looking_at_another", "markings", "overalls", "standing"], "extra": ["actual_fur", "black_bottomwear", "black_clothing", "black_pants", "blue_clothing", "blue_overalls", "corsac_fox", "looking_at_viewer", "t-shirt", "white_clothing", "white_shirt", "white_topwear"], "ground_truth_tags": ["anthro", "bottomwear", "canid", "canine", "claws", "clothed", "clothing", "crossed_arms", "duo", "facial_markings", "fox", "fur", "grey_background", "head_markings", "lagomorph", "leporid", "looking_at_another", "mammal", "markings", "overalls", "pants", "rabbit", "shirt", "standing", "topwear"], "selected_tags": ["actual_fur", "anthro", "black_bottomwear", "black_clothing", "black_pants", "blue_clothing", "blue_overalls", "bottomwear", "canid", "canine", "clothed", "clothing", "corsac_fox", "duo", "fox", "grey_background", "lagomorph", "leporid", "looking_at_viewer", 
"mammal", "pants", "rabbit", "shirt", "t-shirt", "topwear", "white_clothing", "white_shirt", "white_topwear"], "stage3_selected": ["actual_fur", "black_pants", "blue_clothing", "blue_overalls", "corsac_fox", "fox", "grey_background", "rabbit", "simple_background", "t-shirt", "white_shirt"], "stage3_selected_scores": {"simple_background": 0.416, "fox": 0.638, "rabbit": 0.6511, "grey_background": 0.6784, "blue_clothing": 0.6538, "t-shirt": 0.724, "white_shirt": 0.8197, "black_pants": 0.833, "corsac_fox": 0.4193, "blue_overalls": 0.9203, "actual_fur": 0.4837}, "stage3_selected_ranks": {"simple_background": 153, "fox": 46, "rabbit": 41, "grey_background": 30, "blue_clothing": 39, "t-shirt": 22, "white_shirt": 4, "black_pants": 3, "corsac_fox": 152, "blue_overalls": 1, "actual_fur": 137}, "stage3_selected_phrase_ranks": {"simple_background": 8, "fox": 1, "rabbit": 1, "grey_background": 1, "blue_clothing": 8, "t-shirt": 2, "white_shirt": 1, "black_pants": 1, "corsac_fox": 9, "blue_overalls": 1, "actual_fur": 10}, "extra_evidence": {"actual_fur": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4837}, "black_bottomwear": {"source": "implied"}, "black_clothing": {"source": "implied"}, "black_pants": {"source": "stage3", "why": "explicit", "retrieval_score": 0.833}, "blue_clothing": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6538}, "blue_overalls": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9203}, "corsac_fox": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4193}, "looking_at_viewer": {"source": "structural"}, "t-shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.724}, "white_clothing": {"source": "implied"}, "white_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8197}, "white_topwear": {"source": "implied"}}, "structural": ["duo", "anthro", "clothed", "looking_at_viewer"], "probe": ["simple_background", "anthro", "duo"], "t1": 1.59, "t2": 1.41, "t3": 44.32, "t3s": 1.83, 
"t3p": 6.61, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=151 entity=5 copyright_filtered=0 generic_char_to_general=0 unknown_type=3", "Stage3 general_chunk_1: attempt 1 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"weak_implied\"}, {\"i\": 3, \"why\": \"weak_implied\"}, {\"i\": 4, \"why\": \"weak_implied\"}, {\"i\": 5, \"why\": \"weak_implied\"}, {\"i\": 6, \"why\": \"style_or_meta\"}, {\"i\": 8, \"why\": \"weak_implied\"}, {\"i\": 9, \"why\": \"weak_implied\"}, {\"i\": 10, \"why\": \"weak_implied\"}, {\"i\": 11, \"why\": \"weak_implied\"}, {\"i\": 13, \"why\": \"weak_implied\"}, {\"i\": 14, \"why\": \"style_or_meta\"}, {\"i\": 15, \"why\": \"weak_implied\"}, {\"i\": 17, \"why\": \"weak_implied\"}, {\"i\": 18, \"why\": \"weak_implied\"}, {\"i\": 20, \"why\": \"weak_implied\"}, {\"i\": 21, \"why\": \"weak_implied\"}, {\"i\": 23, \"why\": \"weak_implied\"}, {\"i\": 24, \"why\": \"weak_implied\"}, {\"i\": 26, \"why\": \"weak_implied\"}, {\"i\": 28, \"why\": \"style_or_meta\"}, {\"i\": 29, \"why\": \"weak_implied\"}, {\"i\": 31, \"why\": \"style_or_meta\"}, {\"i\": 32, \"why\": \"weak_implied\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 35, \"why\": \"weak_implied\"}, {\"i\": 36, \"why\": \"weak_implied\"}, {\"i\": 37, \"why\": \"weak_implied\"}, {\"i\": 39, \"why\": \"weak_implied\"}, {\"i\": 40, \"why\": \"weak_implied\"}, {\"i\": 41, \"why\": \"weak_implied\"}, {\"i\": 42, \"why\": \"other\"}, {\"i\": 43, \"why\": \"style_or_meta\"}, {\"i\": 44, \"why\": \"weak_implied\"}, {\"i\": 45}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.33.why\n Field required [type=missing, input_value={'i': 45}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE "]}
11
+ {"id": 2034167, "n_gt": 11, "n_retrieved": 204, "n_selected": 86, "n_implied": 26, "n_structural": 4, "n_probe": 3, "ret_R": 0.6364, "P": 0.1163, "R": 0.9091, "F1": 0.2062, "leaf_P": 0.08, "leaf_R": 0.5714, "leaf_F1": 0.1404, "n_leaf_sel": 50, "n_leaf_gt": 7, "ret_P": 0.0343, "sel_given_ret": 1.4286, "over_sel": 7.82, "why": {"explicit": 52, "strong_implied": 3}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 4, "calls_with_selection": 4, "calls_exhausted_retries": 0, "attempts_total": 4, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 4, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 108, "attempts_by_n_local": {"60": {"attempts": 3, "parse_ok": 3, "parse_fail": 0, "errors": 0}, "25": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.1163, "gen_R": 0.9091, "gen_F1": 0.2062, "missed": ["purple_body"], "extra": ["abs", "action_pose", "animal_humanoid", "anthro", "belly", "big_eyes", "blue_fingers", "blue_pawpads", "blue_paws", "blue_stripes", "blue_toes", "blue_tuft", "bovid", "bovid_humanoid", "canid_humanoid", "canine_humanoid", "caprine", "caprine_humanoid", "clothed", "clothing", "curved_tail", "expression_sheet", "facial_markings", "facial_stripes", "fennec_humanoid", "fluffy_fur", "fox_humanoid", "goat_humanoid", "head_markings", "heterochromia", "humanoid", "jumper", "jumping", "light_tail", "male", "male_humanoid", "mammal_humanoid", "markings", "mouth_closed", "multicolored_body", "multicolored_fur", "multicolored_tail", "muscular", "muscular_male", "open_smile", "pawpads", "pig_humanoid", "pink_ears", "pink_stripes", "pink_tail", "pink_tongue", "pose", "posed", "purple_belly", "smile", "striped_face", "striped_neck", "stripes", "suid", "suid_humanoid", "suina", 
"suina_humanoid", "tail", "tailed_humanoid", "tanuki_humanoid", "thin_tail", "tongue", "tongue_out", "two_tone_body", "two_tone_fur", "two_tone_tail", "walking", "wavy_tail", "white_tail", "wolf_humanoid", "x_eyes"], "ground_truth_tags": ["blue_eyes", "blue_nose", "canid", "canine", "fur", "mammal", "open_mouth", "purple_body", "solo", "white_body", "white_fur"], "selected_tags": ["abs", "action_pose", "animal_humanoid", "anthro", "belly", "big_eyes", "blue_eyes", "blue_fingers", "blue_nose", "blue_pawpads", "blue_paws", "blue_stripes", "blue_toes", "blue_tuft", "bovid", "bovid_humanoid", "canid", "canid_humanoid", "canine", "canine_humanoid", "caprine", "caprine_humanoid", "clothed", "clothing", "curved_tail", "expression_sheet", "facial_markings", "facial_stripes", "fennec_humanoid", "fluffy_fur", "fox_humanoid", "fur", "goat_humanoid", "head_markings", "heterochromia", "humanoid", "jumper", "jumping", "light_tail", "male", "male_humanoid", "mammal", "mammal_humanoid", "markings", "mouth_closed", "multicolored_body", "multicolored_fur", "multicolored_tail", "muscular", "muscular_male", "open_mouth", "open_smile", "pawpads", "pig_humanoid", "pink_ears", "pink_stripes", "pink_tail", "pink_tongue", "pose", "posed", "purple_belly", "smile", "solo", "striped_face", "striped_neck", "stripes", "suid", "suid_humanoid", "suina", "suina_humanoid", "tail", "tailed_humanoid", "tanuki_humanoid", "thin_tail", "tongue", "tongue_out", "two_tone_body", "two_tone_fur", "two_tone_tail", "walking", "wavy_tail", "white_body", "white_fur", "white_tail", "wolf_humanoid", "x_eyes"], "stage3_selected": ["abs", "action_pose", "big_eyes", "blue_eyes", "blue_fingers", "blue_nose", "blue_pawpads", "blue_paws", "blue_stripes", "blue_toes", "blue_tuft", "canine_humanoid", "curved_tail", "expression_sheet", "facial_stripes", "fennec_humanoid", "fluffy_fur", "goat_humanoid", "heterochromia", "jumper", "jumping", "light_tail", "male_humanoid", "mouth_closed", "multicolored_fur", 
"multicolored_tail", "muscular", "muscular_male", "open_mouth", "open_smile", "pig_humanoid", "pink_ears", "pink_stripes", "pink_tail", "pink_tongue", "pose", "posed", "purple_belly", "striped_face", "striped_neck", "suina_humanoid", "tail", "tailed_humanoid", "tanuki_humanoid", "thin_tail", "tongue_out", "two_tone_fur", "two_tone_tail", "walking", "wavy_tail", "white_body", "white_fur", "white_tail", "wolf_humanoid", "x_eyes"], "stage3_selected_scores": {"open_mouth": 0.6008, "tail": 0.6107, "white_body": 0.4875, "tongue_out": 0.3536, "blue_eyes": 0.5995, "white_fur": 0.5995, "muscular": 0.3548, "muscular_male": 0.3102, "multicolored_fur": 0.4995, "abs": 0.3223, "two_tone_fur": 0.4901, "open_smile": 0.4868, "multicolored_tail": 0.4718, "pink_tongue": 0.4215, "canine_humanoid": 0.9003, "white_tail": 0.5202, "heterochromia": 0.4423, "two_tone_tail": 0.5197, "blue_nose": 0.6032, "big_eyes": 0.4207, "mouth_closed": 0.5218, "walking": 0.3534, "pink_tail": 0.5444, "blue_pawpads": 0.4891, "wolf_humanoid": 0.819, "pink_ears": 0.5255, "blue_stripes": 0.6748, "thin_tail": 0.5604, "x_eyes": 0.3999, "goat_humanoid": 0.5534, "blue_paws": 0.4986, "light_tail": 0.5671, "striped_face": 0.5807, "tanuki_humanoid": 0.7574, "expression_sheet": 0.4555, "pink_stripes": 0.682, "tailed_humanoid": 0.5525, "wavy_tail": 0.5224, "fluffy_fur": 0.5593, "curved_tail": 0.637, "male_humanoid": 0.5627, "purple_belly": 0.5454, "striped_neck": 0.5948, "blue_fingers": 0.5077, "blue_toes": 0.5148, "suina_humanoid": 0.563, "blue_tuft": 0.5037, "facial_stripes": 0.5968, "pig_humanoid": 0.5894, "fennec_humanoid": 0.7741, "posed": 0.4484, "jumper": 0.4077, "pose": 0.6199, "action_pose": 0.617, "jumping": 0.6014}, "stage3_selected_ranks": {"open_mouth": 28, "tail": 23, "white_body": 114, "tongue_out": 194, "blue_eyes": 29, "white_fur": 30, "muscular": 193, "muscular_male": 202, "multicolored_fur": 100, "abs": 199, "two_tone_fur": 108, "open_smile": 116, "multicolored_tail": 131, "pink_tongue": 166, 
"canine_humanoid": 1, "white_tail": 82, "heterochromia": 158, "two_tone_tail": 83, "blue_nose": 26, "big_eyes": 167, "mouth_closed": 81, "walking": 195, "pink_tail": 62, "blue_pawpads": 109, "wolf_humanoid": 4, "pink_ears": 75, "blue_stripes": 14, "thin_tail": 53, "x_eyes": 177, "goat_humanoid": 58, "blue_paws": 102, "light_tail": 48, "striped_face": 43, "tanuki_humanoid": 8, "expression_sheet": 147, "pink_stripes": 13, "tailed_humanoid": 59, "wavy_tail": 79, "fluffy_fur": 54, "curved_tail": 16, "male_humanoid": 52, "purple_belly": 61, "striped_neck": 35, "blue_fingers": 93, "blue_toes": 87, "suina_humanoid": 51, "blue_tuft": 96, "facial_stripes": 33, "pig_humanoid": 37, "fennec_humanoid": 6, "posed": 154, "jumper": 173, "pose": 18, "action_pose": 19, "jumping": 27}, "stage3_selected_phrase_ranks": {"open_mouth": 1, "tail": 1, "white_body": 8, "tongue_out": 10, "blue_eyes": 1, "white_fur": 1, "muscular": 6, "muscular_male": 8, "multicolored_fur": 6, "abs": 7, "two_tone_fur": 7, "open_smile": 2, "multicolored_tail": 9, "pink_tongue": 3, "canine_humanoid": 1, "white_tail": 7, "heterochromia": 6, "two_tone_tail": 8, "blue_nose": 1, "big_eyes": 6, "mouth_closed": 3, "walking": 10, "pink_tail": 1, "blue_pawpads": 10, "wolf_humanoid": 4, "pink_ears": 4, "blue_stripes": 1, "thin_tail": 5, "x_eyes": 9, "goat_humanoid": 8, "blue_paws": 9, "light_tail": 4, "striped_face": 5, "tanuki_humanoid": 7, "expression_sheet": 8, "pink_stripes": 1, "tailed_humanoid": 9, "wavy_tail": 7, "fluffy_fur": 2, "curved_tail": 1, "male_humanoid": 7, "purple_belly": 4, "striped_neck": 4, "blue_fingers": 5, "blue_toes": 4, "suina_humanoid": 6, "blue_tuft": 7, "facial_stripes": 3, "pig_humanoid": 3, "fennec_humanoid": 6, "posed": 9, "jumper": 3, "pose": 1, "action_pose": 1, "jumping": 1}, "extra_evidence": {"abs": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3223}, "action_pose": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.617}, "animal_humanoid": {"source": 
"implied"}, "anthro": {"source": "structural"}, "belly": {"source": "implied"}, "big_eyes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4207}, "blue_fingers": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5077}, "blue_pawpads": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4891}, "blue_paws": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4986}, "blue_stripes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6748}, "blue_toes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5148}, "blue_tuft": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5037}, "bovid": {"source": "implied"}, "bovid_humanoid": {"source": "implied"}, "canid_humanoid": {"source": "implied"}, "canine_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9003}, "caprine": {"source": "implied"}, "caprine_humanoid": {"source": "implied"}, "clothed": {"source": "structural"}, "clothing": {"source": "implied"}, "curved_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.637}, "expression_sheet": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4555}, "facial_markings": {"source": "implied"}, "facial_stripes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5968}, "fennec_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7741}, "fluffy_fur": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5593}, "fox_humanoid": {"source": "implied"}, "goat_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5534}, "head_markings": {"source": "implied"}, "heterochromia": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4423}, "humanoid": {"source": "implied"}, "jumper": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4077}, "jumping": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.6014}, "light_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 
0.5671}, "male": {"source": "structural"}, "male_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5627}, "mammal_humanoid": {"source": "implied"}, "markings": {"source": "implied"}, "mouth_closed": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5218}, "multicolored_body": {"source": "implied"}, "multicolored_fur": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4995}, "multicolored_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4718}, "muscular": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3548}, "muscular_male": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3102}, "open_smile": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4868}, "pawpads": {"source": "implied"}, "pig_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5894}, "pink_ears": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5255}, "pink_stripes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.682}, "pink_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5444}, "pink_tongue": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4215}, "pose": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.6199}, "posed": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4484}, "purple_belly": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5454}, "smile": {"source": "implied"}, "striped_face": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5807}, "striped_neck": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5948}, "stripes": {"source": "implied"}, "suid": {"source": "implied"}, "suid_humanoid": {"source": "implied"}, "suina": {"source": "implied"}, "suina_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.563}, "tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6107}, "tailed_humanoid": {"source": "stage3", "why": 
"explicit", "retrieval_score": 0.5525}, "tanuki_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7574}, "thin_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5604}, "tongue": {"source": "implied"}, "tongue_out": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3536}, "two_tone_body": {"source": "implied"}, "two_tone_fur": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4901}, "two_tone_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5197}, "walking": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3534}, "wavy_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5224}, "white_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5202}, "wolf_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.819}, "x_eyes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3999}}, "structural": ["solo", "anthro", "male", "clothed"], "probe": ["anthro", "canid", "solo"], "t1": 1.31, "t2": 2.03, "t3": 85.25, "t3s": 1.97, "t3p": 0.96, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=205 entity=4 copyright_filtered=2 generic_char_to_general=0 unknown_type=5"]}
data/eval_results/latency_baseline_seed43.jsonl ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"_meta": true, "timestamp": "2026-03-02T06:02:06.240469", "n_samples": 10, "caption_field": "caption_cogvlm", "skip_rewrite": false, "allow_nsfw": false, "mode": "chunked_map_union", "chunk_size": 60, "eval_path": "data/eval_samples/e621_sfw_sample_1000_seed123_buffer10000_caption_evident.jsonl", "per_phrase_k": 2, "per_phrase_final_k": 10, "temperature": 0.0, "shuffle": false, "seed": 43, "workers": 4, "min_why": "strong_implied", "expand_implications": true, "infer_structural": true, "infer_probe": true, "n_errors": 0, "n_issue_samples": 10, "n_issues_total": 27}
2
+ {"id": 3285630, "n_gt": 12, "n_retrieved": 142, "n_selected": 55, "n_implied": 22, "n_structural": 0, "n_probe": 5, "ret_R": 0.3333, "P": 0.1455, "R": 0.6667, "F1": 0.2388, "leaf_P": 0.08, "leaf_R": 0.2222, "leaf_F1": 0.1176, "n_leaf_sel": 25, "n_leaf_gt": 9, "ret_P": 0.0282, "sel_given_ret": 2.0, "over_sel": 4.58, "why": {"explicit": 29}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 3, "calls_with_selection": 3, "calls_exhausted_retries": 0, "attempts_total": 3, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 3, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 53, "attempts_by_n_local": {"60": {"attempts": 2, "parse_ok": 2, "parse_fail": 0, "errors": 0}, "21": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.1455, "gen_R": 0.6667, "gen_F1": 0.2388, "missed": ["alpha_channel", "clothed", "fingers", "male"], "extra": ["beer_mug", "beverage", "black_body", "black_clothing", "black_fur", "black_shirt", "black_topwear", "bobcat", "business_attire", "business_suit", "businesswear", "coffee_cup", "coffee_mug", "container", "cup", "domestic_cat", "dress_shirt", "felis", "grey_clothing", "grey_shirt", "grey_topwear", "hair_bun", "holding_beverage", "holding_container", "holding_cup", "holding_mug", "holding_object", "lynx", "mug", "pockets", "shirt", "siamese", "suit", "suit_jacket", "sweater", "sweater_vest", "t-shirt", "teal_shirt", "teal_topwear", "text", "topwear", "vest", "white_clothing", "white_dress_shirt", "white_shirt", "white_t-shirt", "white_topwear"], "ground_truth_tags": ["alpha_channel", "anthro", "clothed", "clothing", "felid", "feline", "fingers", "fur", "hair", "male", "mammal", "solo"], "selected_tags": ["anthro", "beer_mug", "beverage", "black_body", 
"black_clothing", "black_fur", "black_shirt", "black_topwear", "bobcat", "business_attire", "business_suit", "businesswear", "clothing", "coffee_cup", "coffee_mug", "container", "cup", "domestic_cat", "dress_shirt", "felid", "feline", "felis", "fur", "grey_clothing", "grey_shirt", "grey_topwear", "hair", "hair_bun", "holding_beverage", "holding_container", "holding_cup", "holding_mug", "holding_object", "lynx", "mammal", "mug", "pockets", "shirt", "siamese", "solo", "suit", "suit_jacket", "sweater", "sweater_vest", "t-shirt", "teal_shirt", "teal_topwear", "text", "topwear", "vest", "white_clothing", "white_dress_shirt", "white_shirt", "white_t-shirt", "white_topwear"], "stage3_selected": ["anthro", "beer_mug", "black_fur", "black_shirt", "bobcat", "business_attire", "business_suit", "businesswear", "coffee_cup", "coffee_mug", "dress_shirt", "feline", "grey_shirt", "hair_bun", "holding_beverage", "holding_container", "holding_cup", "holding_mug", "lynx", "pockets", "shirt", "siamese", "suit_jacket", "sweater_vest", "teal_shirt", "vest", "white_dress_shirt", "white_shirt", "white_t-shirt"], "stage3_selected_scores": {"anthro": 0.4929, "feline": 0.7061, "shirt": 0.7997, "black_fur": 0.7182, "lynx": 0.4992, "vest": 0.8403, "white_shirt": 0.738, "holding_container": 0.7598, "black_shirt": 0.7352, "dress_shirt": 0.7241, "pockets": 0.6589, "holding_cup": 0.7667, "hair_bun": 0.6926, "siamese": 0.6226, "holding_beverage": 0.772, "coffee_mug": 0.7055, "grey_shirt": 0.7582, "business_suit": 0.5745, "coffee_cup": 0.6863, "bobcat": 0.5768, "sweater_vest": 0.7532, "holding_mug": 0.9159, "beer_mug": 0.6598, "white_t-shirt": 0.7329, "suit_jacket": 0.6863, "businesswear": 0.5715, "white_dress_shirt": 0.6881, "business_attire": 0.5657, "teal_shirt": 0.7474}, "stage3_selected_ranks": {"anthro": 120, "feline": 44, "shirt": 5, "black_fur": 35, "lynx": 117, "vest": 3, "white_shirt": 22, "holding_container": 11, "black_shirt": 25, "dress_shirt": 33, "pockets": 60, "holding_cup": 10, 
"hair_bun": 51, "siamese": 71, "holding_beverage": 8, "coffee_mug": 45, "grey_shirt": 13, "business_suit": 85, "coffee_cup": 54, "bobcat": 82, "sweater_vest": 16, "holding_mug": 1, "beer_mug": 59, "white_t-shirt": 26, "suit_jacket": 53, "businesswear": 87, "white_dress_shirt": 52, "business_attire": 89, "teal_shirt": 19}, "stage3_selected_phrase_ranks": {"anthro": 9, "feline": 1, "shirt": 1, "black_fur": 1, "lynx": 6, "vest": 1, "white_shirt": 4, "holding_container": 5, "black_shirt": 8, "dress_shirt": 2, "pockets": 3, "holding_cup": 4, "hair_bun": 1, "siamese": 5, "holding_beverage": 3, "coffee_mug": 3, "grey_shirt": 1, "business_suit": 8, "coffee_cup": 4, "bobcat": 6, "sweater_vest": 3, "holding_mug": 1, "beer_mug": 5, "white_t-shirt": 3, "suit_jacket": 4, "businesswear": 9, "white_dress_shirt": 5, "business_attire": 1, "teal_shirt": 4}, "extra_evidence": {"beer_mug": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6598}, "beverage": {"source": "implied"}, "black_body": {"source": "implied"}, "black_clothing": {"source": "implied"}, "black_fur": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7182}, "black_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7352}, "black_topwear": {"source": "implied"}, "bobcat": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5768}, "business_attire": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5657}, "business_suit": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5745}, "businesswear": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5715}, "coffee_cup": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6863}, "coffee_mug": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7055}, "container": {"source": "implied"}, "cup": {"source": "implied"}, "domestic_cat": {"source": "implied"}, "dress_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7241}, "felis": {"source": "implied"}, 
"grey_clothing": {"source": "implied"}, "grey_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7582}, "grey_topwear": {"source": "implied"}, "hair_bun": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6926}, "holding_beverage": {"source": "stage3", "why": "explicit", "retrieval_score": 0.772}, "holding_container": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7598}, "holding_cup": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7667}, "holding_mug": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9159}, "holding_object": {"source": "implied"}, "lynx": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4992}, "mug": {"source": "implied"}, "pockets": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6589}, "shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7997}, "siamese": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6226}, "suit": {"source": "implied"}, "suit_jacket": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6863}, "sweater": {"source": "implied"}, "sweater_vest": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7532}, "t-shirt": {"source": "implied"}, "teal_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7474}, "teal_topwear": {"source": "implied"}, "text": {"source": "probe"}, "topwear": {"source": "implied"}, "vest": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8403}, "white_clothing": {"source": "implied"}, "white_dress_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6881}, "white_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.738}, "white_t-shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7329}, "white_topwear": {"source": "implied"}}, "structural": [], "probe": ["clothing", "anthro", "text", "felid", "solo"], "t1": 3.66, "t2": 1.76, "t3": 34.39, "t3s": 5.18, "t3p": 12.81, "err": null, "issues": 
["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=141 entity=1 copyright_filtered=1 generic_char_to_general=0 unknown_type=1"]}
3
+ {"id": 260449, "n_gt": 14, "n_retrieved": 160, "n_selected": 26, "n_implied": 4, "n_structural": 6, "n_probe": 6, "ret_R": 0.5714, "P": 0.3846, "R": 0.7143, "F1": 0.5, "leaf_P": 0.125, "leaf_R": 0.2, "leaf_F1": 0.1538, "n_leaf_sel": 16, "n_leaf_gt": 10, "ret_P": 0.05, "sel_given_ret": 1.25, "over_sel": 1.86, "why": {"explicit": 10, "strong_implied": 4}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 3, "calls_with_selection": 3, "calls_exhausted_retries": 0, "attempts_total": 3, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 3, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 69, "attempts_by_n_local": {"60": {"attempts": 2, "parse_ok": 2, "parse_fail": 0, "errors": 0}, "41": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3846, "gen_R": 0.7143, "gen_F1": 0.5, "missed": ["fur", "hair", "human", "male"], "extra": ["anthro", "belly_dancer_outfit", "bottomwear", "dancewear", "duo", "gorilla", "grinning_at_viewer", "humanoid", "loincloth", "monkey", "raised_arms", "spread_arms", "sun_bear", "topless", "trio", "ursine"], "ground_truth_tags": ["ape", "bear", "clothed", "clothing", "dancing", "fur", "group", "hair", "haplorhine", "human", "looking_at_viewer", "male", "mammal", "primate"], "selected_tags": ["anthro", "ape", "bear", "belly_dancer_outfit", "bottomwear", "clothed", "clothing", "dancewear", "dancing", "duo", "gorilla", "grinning_at_viewer", "group", "haplorhine", "humanoid", "loincloth", "looking_at_viewer", "mammal", "monkey", "primate", "raised_arms", "spread_arms", "sun_bear", "topless", "trio", "ursine"], "stage3_selected": ["ape", "bear", "belly_dancer_outfit", "dancewear", "dancing", "gorilla", "grinning_at_viewer", "loincloth", "looking_at_viewer", "monkey", 
"primate", "raised_arms", "spread_arms", "sun_bear"], "stage3_selected_scores": {"looking_at_viewer": 0.5483, "bear": 0.5736, "primate": 0.8904, "loincloth": 0.5697, "dancing": 0.5576, "ape": 0.9767, "raised_arms": 0.5461, "dancewear": 0.3475, "sun_bear": 0.4334, "belly_dancer_outfit": 0.3547, "monkey": 0.7558, "gorilla": 0.8299, "spread_arms": 0.403, "grinning_at_viewer": 0.4425}, "stage3_selected_ranks": {"looking_at_viewer": 18, "bear": 12, "primate": 2, "loincloth": 13, "dancing": 16, "ape": 1, "raised_arms": 19, "dancewear": 127, "sun_bear": 51, "belly_dancer_outfit": 119, "monkey": 6, "gorilla": 4, "spread_arms": 77, "grinning_at_viewer": 42}, "stage3_selected_phrase_ranks": {"looking_at_viewer": 1, "bear": 1, "primate": 1, "loincloth": 1, "dancing": 1, "ape": 1, "raised_arms": 1, "dancewear": 8, "sun_bear": 10, "belly_dancer_outfit": 7, "monkey": 6, "gorilla": 2, "spread_arms": 5, "grinning_at_viewer": 3}, "extra_evidence": {"anthro": {"source": "structural"}, "belly_dancer_outfit": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3547}, "bottomwear": {"source": "implied"}, "dancewear": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3475}, "duo": {"source": "probe"}, "gorilla": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.8299}, "grinning_at_viewer": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4425}, "humanoid": {"source": "structural"}, "loincloth": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5697}, "monkey": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.7558}, "raised_arms": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5461}, "spread_arms": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.403}, "sun_bear": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4334}, "topless": {"source": "structural"}, "trio": {"source": "structural"}, "ursine": {"source": "implied"}}, "structural": ["trio", "anthro", 
"humanoid", "clothed", "topless", "looking_at_viewer"], "probe": ["clothing", "simple_background", "anthro", "duo", "group", "bear"], "t1": 2.77, "t2": 3.88, "t3": 30.11, "t3s": 5.47, "t3p": 6.35, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=161 entity=5 copyright_filtered=2 generic_char_to_general=1 unknown_type=3"]}
4
+ {"id": 1078019, "n_gt": 14, "n_retrieved": 133, "n_selected": 34, "n_implied": 11, "n_structural": 3, "n_probe": 5, "ret_R": 0.7143, "P": 0.3529, "R": 0.8571, "F1": 0.5, "leaf_P": 0.2857, "leaf_R": 0.6667, "leaf_F1": 0.4, "n_leaf_sel": 21, "n_leaf_gt": 9, "ret_P": 0.0752, "sel_given_ret": 1.2, "over_sel": 2.43, "why": {"explicit": 16, "strong_implied": 2}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 3, "calls_with_selection": 3, "calls_exhausted_retries": 0, "attempts_total": 3, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 3, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 67, "attempts_by_n_local": {"60": {"attempts": 2, "parse_ok": 2, "parse_fail": 0, "errors": 0}, "11": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3529, "gen_R": 0.8571, "gen_F1": 0.5, "missed": ["romantic", "romantic_couple"], "extra": ["<3", "blue_hair", "coat", "confident", "domestic_rabbit", "dutch_rabbit", "expressions", "fur_coat", "hair", "hand_holding", "holding_object", "holding_plushie", "holding_toy", "lab_coat", "oryctolagus", "relationship", "round_ears", "setting", "teddy_bear", "topwear", "toy", "winter_coat"], "ground_truth_tags": ["anthro", "blue_eyes", "blush", "clothed", "clothing", "duo", "lagomorph", "leporid", "mammal", "plushie", "rabbit", "romantic", "romantic_couple", "teal_eyes"], "selected_tags": ["<3", "anthro", "blue_eyes", "blue_hair", "blush", "clothed", "clothing", "coat", "confident", "domestic_rabbit", "duo", "dutch_rabbit", "expressions", "fur_coat", "hair", "hand_holding", "holding_object", "holding_plushie", "holding_toy", "lab_coat", "lagomorph", "leporid", "mammal", "oryctolagus", "plushie", "rabbit", "relationship", "round_ears", "setting", "teal_eyes", 
"teddy_bear", "topwear", "toy", "winter_coat"], "stage3_selected": ["blue_eyes", "blue_hair", "coat", "confident", "duo", "dutch_rabbit", "expressions", "fur_coat", "hand_holding", "holding_plushie", "holding_toy", "lab_coat", "relationship", "round_ears", "setting", "teal_eyes", "teddy_bear", "winter_coat"], "stage3_selected_scores": {"duo": 0.3628, "blue_eyes": 0.615, "blue_hair": 0.4201, "coat": 0.6383, "teal_eyes": 0.6283, "lab_coat": 0.516, "teddy_bear": 0.5459, "confident": 0.5161, "expressions": 0.5454, "holding_plushie": 0.7793, "fur_coat": 0.4906, "winter_coat": 0.4759, "dutch_rabbit": 0.4583, "holding_toy": 0.5855, "relationship": 0.6206, "setting": 0.5567, "hand_holding": 0.4283, "round_ears": 0.4342}, "stage3_selected_ranks": {"duo": 131, "blue_eyes": 12, "blue_hair": 118, "coat": 7, "teal_eyes": 8, "lab_coat": 40, "teddy_bear": 22, "confident": 39, "expressions": 23, "holding_plushie": 2, "fur_coat": 53, "winter_coat": 66, "dutch_rabbit": 80, "holding_toy": 14, "relationship": 9, "setting": 20, "hand_holding": 112, "round_ears": 102}, "stage3_selected_phrase_ranks": {"duo": 3, "blue_eyes": 1, "blue_hair": 8, "coat": 1, "teal_eyes": 1, "lab_coat": 5, "teddy_bear": 5, "confident": 7, "expressions": 2, "holding_plushie": 1, "fur_coat": 9, "winter_coat": 10, "dutch_rabbit": 4, "holding_toy": 4, "relationship": 1, "setting": 1, "hand_holding": 9, "round_ears": 10}, "extra_evidence": {"<3": {"source": "probe"}, "blue_hair": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4201}, "coat": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6383}, "confident": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5161}, "domestic_rabbit": {"source": "implied"}, "dutch_rabbit": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4583}, "expressions": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5454}, "fur_coat": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4906}, "hair": {"source": "implied"}, 
"hand_holding": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4283}, "holding_object": {"source": "implied"}, "holding_plushie": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7793}, "holding_toy": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5855}, "lab_coat": {"source": "stage3", "why": "explicit", "retrieval_score": 0.516}, "oryctolagus": {"source": "implied"}, "relationship": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6206}, "round_ears": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4342}, "setting": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5567}, "teddy_bear": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5459}, "topwear": {"source": "implied"}, "toy": {"source": "implied"}, "winter_coat": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4759}}, "structural": ["duo", "anthro", "clothed"], "probe": ["clothing", "anthro", "blush", "duo", "<3"], "t1": 3.58, "t2": 1.8, "t3": 18.71, "t3s": 7.52, "t3p": 13.4, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=131 entity=2 copyright_filtered=1 generic_char_to_general=0 unknown_type=2"]}
5
+ {"id": 1624724, "n_gt": 4, "n_retrieved": 117, "n_selected": 21, "n_implied": 4, "n_structural": 3, "n_probe": 4, "ret_R": 0.75, "P": 0.1905, "R": 1.0, "F1": 0.32, "leaf_P": 0.25, "leaf_R": 1.0, "leaf_F1": 0.4, "n_leaf_sel": 16, "n_leaf_gt": 4, "ret_P": 0.0256, "sel_given_ret": 1.3333, "over_sel": 5.25, "why": {"explicit": 14}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 29, "attempts_by_n_local": {"60": {"attempts": 2, "parse_ok": 2, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.1905, "gen_R": 1.0, "gen_F1": 0.32, "missed": [], "extra": ["anthro", "big_eyes", "clothed", "clothing", "elemental_creature", "floating", "gem", "gem_creature", "glistening", "glistening_nose", "mineral_fauna", "nose", "sparkling_background", "spots", "spotted_legs", "toothy_smile", "wide_eyed"], "ground_truth_tags": ["red_nose", "smile", "solo", "tan_body"], "selected_tags": ["anthro", "big_eyes", "clothed", "clothing", "elemental_creature", "floating", "gem", "gem_creature", "glistening", "glistening_nose", "mineral_fauna", "nose", "red_nose", "smile", "solo", "sparkling_background", "spots", "spotted_legs", "tan_body", "toothy_smile", "wide_eyed"], "stage3_selected": ["big_eyes", "floating", "gem_creature", "glistening_nose", "nose", "red_nose", "smile", "sparkling_background", "spots", "spotted_legs", "tan_body", "toothy_smile", "white_background", "wide_eyed"], "stage3_selected_scores": {"smile": 0.5956, "white_background": 0.6072, "tan_body": 0.6582, "spots": 0.6224, "wide_eyed": 0.4482, "big_eyes": 0.6934, "red_nose": 0.7475, "floating": 
0.6454, "glistening_nose": 0.5913, "spotted_legs": 0.6492, "gem_creature": 0.4594, "sparkling_background": 0.4258, "toothy_smile": 0.4302, "nose": 0.8611}, "stage3_selected_ranks": {"smile": 36, "white_background": 32, "tan_body": 15, "spots": 27, "wide_eyed": 91, "big_eyes": 7, "red_nose": 3, "floating": 20, "glistening_nose": 38, "spotted_legs": 16, "gem_creature": 85, "sparkling_background": 106, "toothy_smile": 101, "nose": 2}, "stage3_selected_phrase_ranks": {"smile": 2, "white_background": 1, "tan_body": 6, "spots": 7, "wide_eyed": 8, "big_eyes": 1, "red_nose": 1, "floating": 1, "glistening_nose": 4, "spotted_legs": 5, "gem_creature": 5, "sparkling_background": 7, "toothy_smile": 10, "nose": 1}, "extra_evidence": {"anthro": {"source": "structural"}, "big_eyes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6934}, "clothed": {"source": "structural"}, "clothing": {"source": "probe"}, "elemental_creature": {"source": "implied"}, "floating": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6454}, "gem": {"source": "implied"}, "gem_creature": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4594}, "glistening": {"source": "implied"}, "glistening_nose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5913}, "mineral_fauna": {"source": "implied"}, "nose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8611}, "sparkling_background": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4258}, "spots": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6224}, "spotted_legs": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6492}, "toothy_smile": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4302}, "wide_eyed": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4482}}, "structural": ["solo", "anthro", "clothed"], "probe": ["clothing", "simple_background", "anthro", "solo"], "t1": 3.57, "t2": 1.49, "t3": 8.55, "t3s": 5.0, "t3p": 13.39, "err": null, 
"issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=120 entity=0 copyright_filtered=2 generic_char_to_general=0 unknown_type=4"]}
6
+ {"id": 1325009, "n_gt": 22, "n_retrieved": 184, "n_selected": 40, "n_implied": 13, "n_structural": 5, "n_probe": 3, "ret_R": 0.4091, "P": 0.375, "R": 0.6818, "F1": 0.4839, "leaf_P": 0.1923, "leaf_R": 0.4167, "leaf_F1": 0.2632, "n_leaf_sel": 26, "n_leaf_gt": 12, "ret_P": 0.0489, "sel_given_ret": 1.6667, "over_sel": 1.82, "why": {"explicit": 18, "strong_implied": 3}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 4, "calls_with_selection": 3, "calls_exhausted_retries": 1, "attempts_total": 6, "attempt_errors": 3, "attempt_parse_fail": 0, "attempt_parse_ok": 3, "invalid_items_total": 0, "oob_indices_total": 1, "dupe_indices_total": 0, "kept_total": 57, "attempts_by_n_local": {"60": {"attempts": 5, "parse_ok": 2, "parse_fail": 0, "errors": 3}, "1": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.5, "call_exhaustion_rate": 0.25}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.375, "gen_R": 0.6818, "gen_F1": 0.4839, "missed": ["chest_tuft", "countershading", "muscular", "muscular_anthro", "muscular_male", "pantherine", "tiger"], "extra": ["action_pose", "back_muscles", "big_biceps", "cheek_tuft", "countershade_body", "eyes", "facial_tuft", "flexing", "flexing_both_biceps", "full-length_portrait", "gesture", "heterochromia", "light_hands", "pattern_clothing", "pattern_topwear", "portrait", "pose", "smile", "smiling_at_viewer", "striped_body", "striped_ears", "striped_fur", "suggestive_pose", "topwear", "white_chest"], "ground_truth_tags": ["anthro", "blue_eyes", "bottomwear", "chest_tuft", "clothed", "clothing", "countershading", "felid", "fur", "hand_on_head", "male", "mammal", "muscular", "muscular_anthro", "muscular_male", "pantherine", "shorts", "solo", "stripes", "tiger", "topless", "tuft"], "selected_tags": ["action_pose", "anthro", "back_muscles", "big_biceps", "blue_eyes", "bottomwear", "cheek_tuft", "clothed", 
"clothing", "countershade_body", "eyes", "facial_tuft", "felid", "flexing", "flexing_both_biceps", "full-length_portrait", "fur", "gesture", "hand_on_head", "heterochromia", "light_hands", "male", "mammal", "pattern_clothing", "pattern_topwear", "portrait", "pose", "shorts", "smile", "smiling_at_viewer", "solo", "striped_body", "striped_ears", "striped_fur", "stripes", "suggestive_pose", "topless", "topwear", "tuft", "white_chest"], "stage3_selected": ["action_pose", "back_muscles", "big_biceps", "blue_eyes", "cheek_tuft", "countershade_body", "eyes", "flexing", "flexing_both_biceps", "full-length_portrait", "gesture", "hand_on_head", "heterochromia", "light_hands", "pattern_topwear", "shorts", "smiling_at_viewer", "striped_ears", "striped_fur", "suggestive_pose", "white_chest"], "stage3_selected_scores": {"blue_eyes": 0.5739, "shorts": 0.5818, "cheek_tuft": 0.472, "gesture": 0.5883, "full-length_portrait": 0.4425, "striped_fur": 0.6411, "heterochromia": 0.4, "hand_on_head": 0.5952, "flexing": 0.5536, "back_muscles": 0.5889, "action_pose": 0.4747, "big_biceps": 0.6943, "white_chest": 0.9204, "striped_ears": 0.4514, "countershade_body": 0.8758, "light_hands": 0.8131, "flexing_both_biceps": 0.5644, "eyes": 0.9807, "smiling_at_viewer": 0.4503, "pattern_topwear": 0.4408, "suggestive_pose": 0.6259}, "stage3_selected_ranks": {"blue_eyes": 63, "shorts": 62, "cheek_tuft": 116, "gesture": 59, "full-length_portrait": 140, "striped_fur": 36, "heterochromia": 162, "hand_on_head": 53, "flexing": 68, "back_muscles": 58, "action_pose": 113, "big_biceps": 24, "white_chest": 2, "striped_ears": 133, "countershade_body": 3, "light_hands": 13, "flexing_both_biceps": 65, "eyes": 1, "smiling_at_viewer": 134, "pattern_topwear": 142, "suggestive_pose": 43}, "stage3_selected_phrase_ranks": {"blue_eyes": 2, "shorts": 1, "cheek_tuft": 8, "gesture": 1, "full-length_portrait": 9, "striped_fur": 2, "heterochromia": 8, "hand_on_head": 2, "flexing": 10, "back_muscles": 7, "action_pose": 7, 
"big_biceps": 7, "white_chest": 1, "striped_ears": 1, "countershade_body": 1, "light_hands": 10, "flexing_both_biceps": 8, "eyes": 1, "smiling_at_viewer": 8, "pattern_topwear": 10, "suggestive_pose": 1}, "extra_evidence": {"action_pose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4747}, "back_muscles": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5889}, "big_biceps": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6943}, "cheek_tuft": {"source": "stage3", "why": "explicit", "retrieval_score": 0.472}, "countershade_body": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8758}, "eyes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9807}, "facial_tuft": {"source": "implied"}, "flexing": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5536}, "flexing_both_biceps": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5644}, "full-length_portrait": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4425}, "gesture": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5883}, "heterochromia": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4}, "light_hands": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8131}, "pattern_clothing": {"source": "implied"}, "pattern_topwear": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4408}, "portrait": {"source": "implied"}, "pose": {"source": "implied"}, "smile": {"source": "implied"}, "smiling_at_viewer": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4503}, "striped_body": {"source": "implied"}, "striped_ears": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4514}, "striped_fur": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6411}, "suggestive_pose": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.6259}, "topwear": {"source": "implied"}, "white_chest": {"source": "stage3", "why": "explicit", "retrieval_score": 
0.9204}}, "structural": ["solo", "anthro", "male", "clothed", "topless"], "probe": ["anthro", "felid", "solo"], "t1": 2.06, "t2": 1.77, "t3": 63.87, "t3s": 1.99, "t3p": 3.52, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=181 entity=2 copyright_filtered=2 generic_char_to_general=0 unknown_type=2", "Stage3 general_chunk_1: attempt 1 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"weak_implied\"}, {\"i\": 2, \"why\": \"weak_implied\"}, {\"i\": 3, \"why\": \"weak_implied\"}, {\"i\": 4, \"why\": \"weak_implied\"}, {\"i\": 5, \"why\": \"weak_implied\"}, {\"i\": 6, \"why\": \"explicit\"}, {\"i\": 7, \"why\": \"explicit\"}, {\"i\": 8, \"why\": \"weak_implied\"}, {\"i\": 9, \"why\": \"weak_implied\"}, {\"i\": 10, \"why\": \"weak_implied\"}, {\"i\": 11, \"why\": \"weak_implied\"}, {\"i\": 12, \"why\": \"weak_implied\"}, {\"i\": 13, \"why\": \"weak_implied\"}, {\"i\": 14, \"why\": \"weak_implied\"}, {\"i\": 15, \"why\": \"weak_implied\"}, {\"i\": 16, \"why\": \"weak_implied\"}, {\"i\": 17, \"why\": \"weak_implied\"}, {\"i\": 18, \"why\": \"weak_implied\"}, {\"i\": 19, \"why\": \"weak_implied\"}, {\"i\": 20, \"why\": \"explicit\"}, {\"i\": 21, \"why\": \"weak_implied\"}, {\"i\": 22, \"why\": \"explicit\"}, {\"i\": 23, \"why\": \"weak_implied\"}, {\"i\": 24, \"why\": \"weak_implied\"}, {\"i\": 25, \"why\": \"weak_implied\"}, {\"i\": 26, \"why\": \"weak_implied\"}, {\"i\": 27, \"why\": \"weak_implied\"}, {\"i\": 28, \"why\": \"weak_implied\"}, {\"i\": 29, \"why\": \"weak_implied\"}, {\"i\": 30, \"why\": \"explicit\"}, {\"i\": 31, \"why\": \"weak_implied\"}, {\"i\": 32, \"why\": \"explicit\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 34, \"why\": \"weak_implied\"}, {\"i\": 35}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.34.why\n Field required [type=missing, input_value={'i': 35}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_1: attempt 2 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"weak_implied\"}, {\"i\": 2, \"why\": \"weak_implied\"}, {\"i\": 3, \"why\": \"weak_implied\"}, {\"i\": 4, \"why\": \"weak_implied\"}, {\"i\": 5, \"why\": \"weak_implied\"}, {\"i\": 6, \"why\": \"explicit\"}, {\"i\": 7, \"why\": \"explicit\"}, {\"i\": 8, \"why\": \"weak_implied\"}, {\"i\": 9, \"why\": \"weak_implied\"}, {\"i\": 10, \"why\": \"weak_implied\"}, {\"i\": 11, \"why\": \"weak_implied\"}, {\"i\": 12, \"why\": \"weak_implied\"}, {\"i\": 13, \"why\": \"weak_implied\"}, {\"i\": 14, \"why\": \"weak_implied\"}, {\"i\": 15, \"why\": \"weak_implied\"}, {\"i\": 16, \"why\": \"weak_implied\"}, {\"i\": 17, \"why\": \"weak_implied\"}, {\"i\": 18, \"why\": \"weak_implied\"}, {\"i\": 19, \"why\": \"weak_implied\"}, {\"i\": 20, \"why\": \"explicit\"}, {\"i\": 21, \"why\": \"weak_implied\"}, {\"i\": 22, \"why\": \"explicit\"}, {\"i\": 23, \"why\": \"weak_implied\"}, {\"i\": 24, \"why\": \"weak_implied\"}, {\"i\": 25, \"why\": \"weak_implied\"}, {\"i\": 26, \"why\": \"weak_implied\"}, {\"i\": 27, \"why\": \"weak_implied\"}, {\"i\": 28, \"why\": \"weak_implied\"}, {\"i\": 29, \"why\": \"weak_implied\"}, {\"i\": 30, \"why\": \"explicit\"}, {\"i\": 31, \"why\": \"weak_implied\"}, {\"i\": 32, \"why\": \"explicit\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 34, \"why\": \"weak_implied\"}, {\"i\": 35}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.34.why\n Field required [type=missing, input_value={'i': 35}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_1: attempt 3 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"weak_implied\"}, {\"i\": 2, \"why\": \"weak_implied\"}, {\"i\": 3, \"why\": \"weak_implied\"}, {\"i\": 4, \"why\": \"weak_implied\"}, {\"i\": 5, \"why\": \"weak_implied\"}, {\"i\": 6, \"why\": \"explicit\"}, {\"i\": 7, \"why\": \"explicit\"}, {\"i\": 8, \"why\": \"weak_implied\"}, {\"i\": 9, \"why\": \"weak_implied\"}, {\"i\": 10, \"why\": \"weak_implied\"}, {\"i\": 11, \"why\": \"weak_implied\"}, {\"i\": 12, \"why\": \"weak_implied\"}, {\"i\": 13, \"why\": \"weak_implied\"}, {\"i\": 14, \"why\": \"weak_implied\"}, {\"i\": 15, \"why\": \"weak_implied\"}, {\"i\": 16, \"why\": \"weak_implied\"}, {\"i\": 17, \"why\": \"weak_implied\"}, {\"i\": 18, \"why\": \"weak_implied\"}, {\"i\": 19, \"why\": \"weak_implied\"}, {\"i\": 20, \"why\": \"explicit\"}, {\"i\": 21, \"why\": \"weak_implied\"}, {\"i\": 22, \"why\": \"explicit\"}, {\"i\": 23, \"why\": \"weak_implied\"}, {\"i\": 24, \"why\": \"weak_implied\"}, {\"i\": 25, \"why\": \"weak_implied\"}, {\"i\": 26, \"why\": \"weak_implied\"}, {\"i\": 27, \"why\": \"weak_implied\"}, {\"i\": 28, \"why\": \"weak_implied\"}, {\"i\": 29, \"why\": \"weak_implied\"}, {\"i\": 30, \"why\": \"explicit\"}, {\"i\": 31, \"why\": \"weak_implied\"}, {\"i\": 32, \"why\": \"explicit\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 34, \"why\": \"weak_implied\"}, {\"i\": 35}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.34.why\n Field required [type=missing, input_value={'i': 35}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_1: gave up after 3 attempts"]}
7
+ {"id": 1023509, "n_gt": 13, "n_retrieved": 190, "n_selected": 23, "n_implied": 6, "n_structural": 5, "n_probe": 4, "ret_R": 0.6154, "P": 0.3913, "R": 0.6923, "F1": 0.5, "leaf_P": 0.2, "leaf_R": 0.5, "leaf_F1": 0.2857, "n_leaf_sel": 15, "n_leaf_gt": 6, "ret_P": 0.0421, "sel_given_ret": 1.125, "over_sel": 1.77, "why": {"explicit": 7, "strong_implied": 5}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 3, "calls_with_selection": 3, "calls_exhausted_retries": 0, "attempts_total": 5, "attempt_errors": 2, "attempt_parse_fail": 0, "attempt_parse_ok": 3, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 98, "attempts_by_n_local": {"60": {"attempts": 4, "parse_ok": 2, "parse_fail": 0, "errors": 2}, "58": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.4, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3913, "gen_R": 0.6923, "gen_F1": 0.5, "missed": ["dialogue", "fur", "white_body", "white_fur"], "extra": ["agamid", "anthro", "clothed", "clothing", "dark_theme", "darkness", "frilled_lizard", "group", "guardian", "light", "male_human", "mask", "note", "taur"], "ground_truth_tags": ["bovid", "caprine", "dialogue", "fur", "goat", "human", "lizard", "mammal", "reptile", "scalie", "text", "white_body", "white_fur"], "selected_tags": ["agamid", "anthro", "bovid", "caprine", "clothed", "clothing", "dark_theme", "darkness", "frilled_lizard", "goat", "group", "guardian", "human", "light", "lizard", "male_human", "mammal", "mask", "note", "reptile", "scalie", "taur", "text"], "stage3_selected": ["dark_theme", "darkness", "frilled_lizard", "goat", "guardian", "human", "light", "lizard", "male_human", "mask", "note", "text"], "stage3_selected_scores": {"text": 0.3659, "mask": 0.3726, "light": 0.5823, "darkness": 0.5976, "dark_theme": 0.3993, "note": 0.5657, 
"guardian": 0.3706, "human": 0.5571, "lizard": 0.5942, "goat": 0.5776, "frilled_lizard": 0.458, "male_human": 0.4223}, "stage3_selected_ranks": {"text": 127, "mask": 114, "light": 6, "darkness": 2, "dark_theme": 88, "note": 11, "guardian": 118, "human": 12, "lizard": 3, "goat": 8, "frilled_lizard": 29, "male_human": 65}, "stage3_selected_phrase_ranks": {"text": 8, "mask": 3, "light": 1, "darkness": 1, "dark_theme": 4, "note": 1, "guardian": 3, "human": 1, "lizard": 1, "goat": 1, "frilled_lizard": 2, "male_human": 2}, "extra_evidence": {"agamid": {"source": "implied"}, "anthro": {"source": "structural"}, "clothed": {"source": "structural"}, "clothing": {"source": "probe"}, "dark_theme": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3993}, "darkness": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5976}, "frilled_lizard": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.458}, "group": {"source": "structural"}, "guardian": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3706}, "light": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5823}, "male_human": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4223}, "mask": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3726}, "note": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5657}, "taur": {"source": "structural"}}, "structural": ["group", "anthro", "taur", "clothed", "text"], "probe": ["clothing", "simple_background", "anthro", "text"], "t1": 2.21, "t2": 1.75, "t3": 62.45, "t3s": 1.02, "t3p": 4.36, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=178 entity=4 copyright_filtered=8 generic_char_to_general=0 unknown_type=1", "Stage3 general_chunk_0: attempt 1 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"other\"}, {\"i\": 3, \"why\": \"explicit\"}, 
{\"i\": 4, \"why\": \"explicit\"}, {\"i\": 5, \"why\": \"weak_implied\"}, {\"i\": 6, \"why\": \"explicit\"}, {\"i\": 7, \"why\": \"explicit\"}, {\"i\": 8, \"why\": \"other\"}, {\"i\": 9, \"why\": \"other\"}, {\"i\": 10, \"why\": \"explicit\"}, {\"i\": 11, \"why\": \"explicit\"}, {\"i\": 12, \"why\": \"explicit\"}, {\"i\": 13, \"why\": \"explicit\"}, {\"i\": 14, \"why\": \"other\"}, {\"i\": 15, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"other\"}, {\"i\": 17, \"why\": \"explicit\"}, {\"i\": 18, \"why\": \"explicit\"}, {\"i\": 20, \"why\": \"other\"}, {\"i\": 23, \"why\": \"other\"}, {\"i\": 25, \"why\": \"other\"}, {\"i\": 30, \"why\": \"explicit\"}, {\"i\": 31, \"why\": \"other\"}, {\"i\": 32, \"why\": \"explicit\"}, {\"i\": 33, \"why\": \"other\"}, {\"i\": 34, \"why\": \"explicit\"}, {\"i\": 35, \"why\": \"explicit\"}, {\"i\": 36, \"why\": \"other\"}, {\"i\": 37, \"why\": \"other\"}, {\"i\": 38, \"why\": \"other\"}, {\"i\": 40, \"why\": \"explicit\"}, {\"i\": 41, \"why\": \"other\"}, {\"i\": 42, \"why\": \"other\"}, {\"i\": 43, \"why\": \"other\"}, {\"i\": 45, \"why\": \"other\"}, {\"i\": 50, \"why\": \"other\"}, {\"i\": 51, \"why\": \"explicit\"}, {\"i\": 52, \"why\": \"other\"}, {\"i\": 55, \"why\": \"explicit\"}, {\"i\": 56}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.38.why\n Field required [type=missing, input_value={'i': 56}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_0: attempt 2 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"other\"}, {\"i\": 3, \"why\": \"explicit\"}, {\"i\": 4, \"why\": \"explicit\"}, {\"i\": 5, \"why\": \"weak_implied\"}, {\"i\": 6, \"why\": \"explicit\"}, {\"i\": 7, \"why\": \"explicit\"}, {\"i\": 8, \"why\": \"other\"}, {\"i\": 9, \"why\": \"other\"}, {\"i\": 10, \"why\": \"explicit\"}, {\"i\": 11, \"why\": \"explicit\"}, {\"i\": 12, \"why\": \"explicit\"}, {\"i\": 13, \"why\": \"explicit\"}, {\"i\": 14, \"why\": \"other\"}, {\"i\": 15, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"other\"}, {\"i\": 17, \"why\": \"other\"}, {\"i\": 18, \"why\": \"explicit\"}, {\"i\": 20, \"why\": \"other\"}, {\"i\": 21, \"why\": \"other\"}, {\"i\": 23, \"why\": \"other\"}, {\"i\": 25, \"why\": \"other\"}, {\"i\": 27, \"why\": \"other\"}, {\"i\": 30, \"why\": \"explicit\"}, {\"i\": 31, \"why\": \"other\"}, {\"i\": 32, \"why\": \"explicit\"}, {\"i\": 34, \"why\": \"explicit\"}, {\"i\": 35, \"why\": \"explicit\"}, {\"i\": 36, \"why\": \"other\"}, {\"i\": 37, \"why\": \"other\"}, {\"i\": 38, \"why\": \"other\"}, {\"i\": 40, \"why\": \"explicit\"}, {\"i\": 41, \"why\": \"other\"}, {\"i\": 42, \"why\": \"other\"}, {\"i\": 43, \"why\": \"other\"}, {\"i\": 45, \"why\": \"other\"}, {\"i\": 46, \"why\": \"other\"}, {\"i\": 48, \"why\": \"explicit\"}, {\"i\": 49, \"why\": \"other\"}, {\"i\": 50}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.38.why\n Field required [type=missing, input_value={'i': 50}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE "]}
8
+ {"id": 335343, "n_gt": 15, "n_retrieved": 208, "n_selected": 46, "n_implied": 9, "n_structural": 3, "n_probe": 4, "ret_R": 0.6667, "P": 0.2826, "R": 0.8667, "F1": 0.4262, "leaf_P": 0.2059, "leaf_R": 0.5833, "leaf_F1": 0.3043, "n_leaf_sel": 34, "n_leaf_gt": 12, "ret_P": 0.0481, "sel_given_ret": 1.3, "over_sel": 3.07, "why": {"explicit": 32, "strong_implied": 2}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 4, "calls_with_selection": 4, "calls_exhausted_retries": 0, "attempts_total": 4, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 4, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 93, "attempts_by_n_local": {"60": {"attempts": 3, "parse_ok": 3, "parse_fail": 0, "errors": 0}, "31": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.2826, "gen_R": 0.8667, "gen_F1": 0.4262, "missed": ["angry", "eyes_closed"], "extra": ["annoyed_expression", "anthro", "applying_makeup", "bed_sheet", "bedding", "bedroom", "blanket", "clothing", "english_text", "expressions", "eyes", "half-closed_eyes", "half-length_portrait", "head_on_pillow", "humanoid", "letters", "looking_down_at_another", "lying_on_bed", "mascara", "narrowed_eyes", "on_bed", "pajamas", "personal_grooming", "portrait", "purple_theme", "relaxed_expression", "resting", "sleeping_together", "sleepover", "text_box", "under_blanket", "vase", "yellow_eyeshadow"], "ground_truth_tags": ["angry", "bed", "blonde_hair", "blue_eyes", "duo", "eyes_closed", "eyeshadow", "furniture", "green_eyes", "hair", "lying", "makeup", "purple_hair", "sleeping", "text"], "selected_tags": ["annoyed_expression", "anthro", "applying_makeup", "bed", "bed_sheet", "bedding", "bedroom", "blanket", "blonde_hair", "blue_eyes", "clothing", "duo", "english_text", 
"expressions", "eyes", "eyeshadow", "furniture", "green_eyes", "hair", "half-closed_eyes", "half-length_portrait", "head_on_pillow", "humanoid", "letters", "looking_down_at_another", "lying", "lying_on_bed", "makeup", "mascara", "narrowed_eyes", "on_bed", "pajamas", "personal_grooming", "portrait", "purple_hair", "purple_theme", "relaxed_expression", "resting", "sleeping", "sleeping_together", "sleepover", "text", "text_box", "under_blanket", "vase", "yellow_eyeshadow"], "stage3_selected": ["annoyed_expression", "applying_makeup", "bed_sheet", "bedding", "bedroom", "blanket", "blonde_hair", "blue_eyes", "english_text", "expressions", "eyes", "green_eyes", "hair", "half-closed_eyes", "half-length_portrait", "head_on_pillow", "letters", "looking_down_at_another", "lying_on_bed", "makeup", "mascara", "pajamas", "purple_hair", "purple_theme", "relaxed_expression", "resting", "sleeping", "sleeping_together", "sleepover", "text", "text_box", "under_blanket", "vase", "yellow_eyeshadow"], "stage3_selected_scores": {"hair": 0.6031, "text": 0.6007, "blue_eyes": 0.6014, "green_eyes": 0.5989, "blonde_hair": 0.5986, "half-closed_eyes": 0.3951, "purple_hair": 0.5642, "makeup": 0.5965, "bedroom": 0.4901, "sleeping": 0.6027, "bedding": 0.3909, "half-length_portrait": 0.4197, "bed_sheet": 0.3993, "blanket": 0.4205, "mascara": 0.4462, "lying_on_bed": 0.4093, "text_box": 0.3916, "purple_theme": 0.4555, "vase": 0.3521, "resting": 0.5034, "annoyed_expression": 0.7251, "expressions": 0.5439, "head_on_pillow": 0.3887, "sleeping_together": 0.5084, "sleepover": 0.5269, "under_blanket": 0.4281, "yellow_eyeshadow": 0.4551, "letters": 0.3656, "applying_makeup": 0.473, "relaxed_expression": 0.5056, "eyes": 0.8951, "english_text": 0.4189, "pajamas": 0.4086, "looking_down_at_another": 0.4491}, "stage3_selected_ranks": {"hair": 5, "text": 8, "blue_eyes": 7, "green_eyes": 9, "blonde_hair": 10, "half-closed_eyes": 140, "purple_hair": 14, "makeup": 11, "bedroom": 43, "sleeping": 6, "bedding": 149, 
"half-length_portrait": 100, "bed_sheet": 134, "blanket": 98, "mascara": 74, "lying_on_bed": 116, "text_box": 146, "purple_theme": 63, "vase": 187, "resting": 33, "annoyed_expression": 2, "expressions": 18, "head_on_pillow": 151, "sleeping_together": 31, "sleepover": 26, "under_blanket": 91, "yellow_eyeshadow": 64, "letters": 176, "applying_makeup": 55, "relaxed_expression": 32, "eyes": 1, "english_text": 103, "pajamas": 120, "looking_down_at_another": 69}, "stage3_selected_phrase_ranks": {"hair": 1, "text": 1, "blue_eyes": 1, "green_eyes": 1, "blonde_hair": 1, "half-closed_eyes": 10, "purple_hair": 1, "makeup": 1, "bedroom": 1, "sleeping": 1, "bedding": 7, "half-length_portrait": 7, "bed_sheet": 5, "blanket": 7, "mascara": 9, "lying_on_bed": 4, "text_box": 8, "purple_theme": 10, "vase": 8, "resting": 1, "annoyed_expression": 1, "expressions": 3, "head_on_pillow": 8, "sleeping_together": 2, "sleepover": 1, "under_blanket": 8, "yellow_eyeshadow": 6, "letters": 9, "applying_makeup": 4, "relaxed_expression": 6, "eyes": 1, "english_text": 4, "pajamas": 3, "looking_down_at_another": 5}, "extra_evidence": {"annoyed_expression": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7251}, "anthro": {"source": "probe"}, "applying_makeup": {"source": "stage3", "why": "explicit", "retrieval_score": 0.473}, "bed_sheet": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3993}, "bedding": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3909}, "bedroom": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4901}, "blanket": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4205}, "clothing": {"source": "implied"}, "english_text": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4189}, "expressions": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5439}, "eyes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8951}, "half-closed_eyes": {"source": "stage3", "why": "explicit", "retrieval_score": 
0.3951}, "half-length_portrait": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4197}, "head_on_pillow": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3887}, "humanoid": {"source": "structural"}, "letters": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3656}, "looking_down_at_another": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4491}, "lying_on_bed": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4093}, "mascara": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4462}, "narrowed_eyes": {"source": "implied"}, "on_bed": {"source": "implied"}, "pajamas": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4086}, "personal_grooming": {"source": "implied"}, "portrait": {"source": "implied"}, "purple_theme": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4555}, "relaxed_expression": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5056}, "resting": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5034}, "sleeping_together": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5084}, "sleepover": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5269}, "text_box": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3916}, "under_blanket": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4281}, "vase": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3521}, "yellow_eyeshadow": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4551}}, "structural": ["duo", "humanoid", "text"], "probe": ["simple_background", "anthro", "duo", "text"], "t1": 1.35, "t2": 1.92, "t3": 31.07, "t3s": 0.86, "t3p": 2.75, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=211 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=4"]}
9
+ {"id": 17482, "n_gt": 22, "n_retrieved": 125, "n_selected": 58, "n_implied": 20, "n_structural": 4, "n_probe": 3, "ret_R": 0.4545, "P": 0.2931, "R": 0.7727, "F1": 0.425, "leaf_P": 0.1613, "leaf_R": 0.3846, "leaf_F1": 0.2273, "n_leaf_sel": 31, "n_leaf_gt": 13, "ret_P": 0.08, "sel_given_ret": 1.7, "over_sel": 2.64, "why": {"explicit": 34, "strong_implied": 1}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 3, "calls_with_selection": 3, "calls_exhausted_retries": 0, "attempts_total": 4, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 4, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 61, "attempts_by_n_local": {"60": {"attempts": 2, "parse_ok": 2, "parse_fail": 0, "errors": 0}, "11": {"attempts": 2, "parse_ok": 2, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.2931, "gen_R": 0.7727, "gen_F1": 0.425, "missed": ["fingers", "fur", "holding_musical_instrument", "holding_object", "music"], "extra": ["abyssal_wolf", "acoustic_guitar", "alpha_channel", "annoyed_expression", "auburn_hair", "big_claws", "big_tail", "blonde_hair", "bottomwear", "canis", "claws_out", "curled_hair", "denim", "denim_clothing", "digitigrade", "electric_guitar", "finger_claws", "flowing_hair", "holding_guitar", "jeans", "leggings", "legwear", "long_claws", "long_tail", "looking_at_viewer", "pants", "playing_guitar", "playing_music", "pointed_tail", "shirt", "shorts", "topwear", "torn_bottomwear", "torn_jeans", "torn_leggings", "torn_legwear", "torn_pants", "torn_shirt", "torn_shorts", "torn_topwear", "wavy_hair"], "ground_truth_tags": ["anthro", "bass_guitar", "canid", "canine", "claws", "clothed", "clothing", "fingers", "fur", "guitar", "hair", "holding_musical_instrument", "holding_object", "mammal", "music", "musical_instrument", 
"plucked_string_instrument", "solo", "spade_tail", "string_instrument", "tail", "torn_clothing"], "selected_tags": ["abyssal_wolf", "acoustic_guitar", "alpha_channel", "annoyed_expression", "anthro", "auburn_hair", "bass_guitar", "big_claws", "big_tail", "blonde_hair", "bottomwear", "canid", "canine", "canis", "claws", "claws_out", "clothed", "clothing", "curled_hair", "denim", "denim_clothing", "digitigrade", "electric_guitar", "finger_claws", "flowing_hair", "guitar", "hair", "holding_guitar", "jeans", "leggings", "legwear", "long_claws", "long_tail", "looking_at_viewer", "mammal", "musical_instrument", "pants", "playing_guitar", "playing_music", "plucked_string_instrument", "pointed_tail", "shirt", "shorts", "solo", "spade_tail", "string_instrument", "tail", "topwear", "torn_bottomwear", "torn_clothing", "torn_jeans", "torn_leggings", "torn_legwear", "torn_pants", "torn_shirt", "torn_shorts", "torn_topwear", "wavy_hair"], "stage3_selected": ["abyssal_wolf", "acoustic_guitar", "annoyed_expression", "auburn_hair", "bass_guitar", "big_claws", "big_tail", "blonde_hair", "canis", "claws", "claws_out", "curled_hair", "digitigrade", "electric_guitar", "finger_claws", "flowing_hair", "hair", "holding_guitar", "long_claws", "long_tail", "playing_guitar", "plucked_string_instrument", "pointed_tail", "spade_tail", "string_instrument", "tail", "torn_bottomwear", "torn_clothing", "torn_jeans", "torn_leggings", "torn_shirt", "torn_shorts", "torn_topwear", "transparent_background", "wavy_hair"], "stage3_selected_scores": {"hair": 0.5731, "tail": 0.5659, "canis": 0.4098, "claws": 0.5684, "blonde_hair": 0.382, "digitigrade": 0.4195, "torn_clothing": 0.4133, "long_tail": 0.4222, "finger_claws": 0.4395, "big_tail": 0.3841, "spade_tail": 0.618, "string_instrument": 0.8617, "torn_bottomwear": 0.4362, "plucked_string_instrument": 0.8658, "curled_hair": 0.3875, "torn_topwear": 0.3945, "wavy_hair": 0.3492, "torn_shirt": 0.4049, "long_claws": 0.4365, "playing_guitar": 0.9317, 
"torn_jeans": 0.4824, "claws_out": 0.438, "big_claws": 0.4299, "annoyed_expression": 0.4693, "electric_guitar": 0.8664, "pointed_tail": 0.3768, "torn_shorts": 0.3996, "auburn_hair": 0.3767, "bass_guitar": 0.9118, "flowing_hair": 0.5669, "abyssal_wolf": 0.4098, "holding_guitar": 0.8442, "torn_leggings": 0.4244, "acoustic_guitar": 0.8654, "transparent_background": 0.4526}, "stage3_selected_ranks": {"hair": 16, "tail": 19, "canis": 90, "claws": 17, "blonde_hair": 112, "digitigrade": 84, "torn_clothing": 87, "long_tail": 83, "finger_claws": 66, "big_tail": 110, "spade_tail": 12, "string_instrument": 8, "torn_bottomwear": 71, "plucked_string_instrument": 6, "curled_hair": 107, "torn_topwear": 99, "wavy_hair": 130, "torn_shirt": 91, "long_claws": 70, "playing_guitar": 2, "torn_jeans": 39, "claws_out": 68, "big_claws": 79, "annoyed_expression": 45, "electric_guitar": 5, "pointed_tail": 116, "torn_shorts": 95, "auburn_hair": 117, "bass_guitar": 3, "flowing_hair": 18, "abyssal_wolf": 89, "holding_guitar": 11, "torn_leggings": 82, "acoustic_guitar": 7, "transparent_background": 59}, "stage3_selected_phrase_ranks": {"hair": 1, "tail": 1, "canis": 10, "claws": 1, "blonde_hair": 6, "digitigrade": 10, "torn_clothing": 6, "long_tail": 5, "finger_claws": 5, "big_tail": 9, "spade_tail": 1, "string_instrument": 7, "torn_bottomwear": 3, "plucked_string_instrument": 5, "curled_hair": 5, "torn_topwear": 10, "wavy_hair": 9, "torn_shirt": 7, "long_claws": 8, "playing_guitar": 1, "torn_jeans": 1, "claws_out": 6, "big_claws": 9, "annoyed_expression": 10, "electric_guitar": 4, "pointed_tail": 5, "torn_shorts": 8, "auburn_hair": 7, "bass_guitar": 2, "flowing_hair": 1, "abyssal_wolf": 9, "holding_guitar": 10, "torn_leggings": 5, "acoustic_guitar": 5, "transparent_background": 9}, "extra_evidence": {"abyssal_wolf": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4098}, "acoustic_guitar": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8654}, "alpha_channel": 
{"source": "implied"}, "annoyed_expression": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4693}, "auburn_hair": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3767}, "big_claws": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4299}, "big_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3841}, "blonde_hair": {"source": "stage3", "why": "explicit", "retrieval_score": 0.382}, "bottomwear": {"source": "implied"}, "canis": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4098}, "claws_out": {"source": "stage3", "why": "explicit", "retrieval_score": 0.438}, "curled_hair": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3875}, "denim": {"source": "implied"}, "denim_clothing": {"source": "implied"}, "digitigrade": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4195}, "electric_guitar": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8664}, "finger_claws": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4395}, "flowing_hair": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5669}, "holding_guitar": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8442}, "jeans": {"source": "implied"}, "leggings": {"source": "implied"}, "legwear": {"source": "implied"}, "long_claws": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4365}, "long_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4222}, "looking_at_viewer": {"source": "structural"}, "pants": {"source": "implied"}, "playing_guitar": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9317}, "playing_music": {"source": "implied"}, "pointed_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3768}, "shirt": {"source": "implied"}, "shorts": {"source": "implied"}, "topwear": {"source": "implied"}, "torn_bottomwear": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4362}, "torn_jeans": {"source": "stage3", "why": 
"explicit", "retrieval_score": 0.4824}, "torn_leggings": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4244}, "torn_legwear": {"source": "implied"}, "torn_pants": {"source": "implied"}, "torn_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4049}, "torn_shorts": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3996}, "torn_topwear": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3945}, "wavy_hair": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3492}}, "structural": ["solo", "anthro", "clothed", "looking_at_viewer"], "probe": ["anthro", "canid", "solo"], "t1": 1.43, "t2": 1.23, "t3": 40.57, "t3s": 1.09, "t3p": 3.22, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=131 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
10
+ {"id": 2021552, "n_gt": 25, "n_retrieved": 169, "n_selected": 42, "n_implied": 16, "n_structural": 4, "n_probe": 4, "ret_R": 0.64, "P": 0.5, "R": 0.84, "F1": 0.6269, "leaf_P": 0.3684, "leaf_R": 0.4667, "leaf_F1": 0.4118, "n_leaf_sel": 19, "n_leaf_gt": 15, "ret_P": 0.0947, "sel_given_ret": 1.3125, "over_sel": 1.68, "why": {"explicit": 22}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 3, "calls_with_selection": 3, "calls_exhausted_retries": 0, "attempts_total": 4, "attempt_errors": 1, "attempt_parse_fail": 0, "attempt_parse_ok": 3, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 92, "attempts_by_n_local": {"60": {"attempts": 3, "parse_ok": 2, "parse_fail": 0, "errors": 1}, "49": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.25, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.5, "gen_R": 0.84, "gen_F1": 0.6269, "missed": ["crossed_arms", "looking_at_another", "overalls", "standing"], "extra": ["3_claws", "black_bottomwear", "black_clothing", "black_pants", "blue_clothing", "dress_shirt", "fluffy_fur", "grey_clothing", "grey_shirt", "grey_topwear", "long_ears", "looking_at_viewer", "open_mouth", "t-shirt", "toe_claws", "white_body", "white_clothing", "white_fur", "white_shirt", "white_t-shirt", "white_topwear"], "ground_truth_tags": ["anthro", "bottomwear", "canid", "canine", "claws", "clothed", "clothing", "crossed_arms", "duo", "facial_markings", "fox", "fur", "grey_background", "head_markings", "lagomorph", "leporid", "looking_at_another", "mammal", "markings", "overalls", "pants", "rabbit", "shirt", "standing", "topwear"], "selected_tags": ["3_claws", "anthro", "black_bottomwear", "black_clothing", "black_pants", "blue_clothing", "bottomwear", "canid", "canine", "claws", "clothed", "clothing", "dress_shirt", "duo", "facial_markings", 
"fluffy_fur", "fox", "fur", "grey_background", "grey_clothing", "grey_shirt", "grey_topwear", "head_markings", "lagomorph", "leporid", "long_ears", "looking_at_viewer", "mammal", "markings", "open_mouth", "pants", "rabbit", "shirt", "t-shirt", "toe_claws", "topwear", "white_body", "white_clothing", "white_fur", "white_shirt", "white_t-shirt", "white_topwear"], "stage3_selected": ["3_claws", "black_bottomwear", "black_pants", "blue_clothing", "claws", "dress_shirt", "facial_markings", "fluffy_fur", "fox", "fur", "grey_background", "grey_shirt", "long_ears", "open_mouth", "rabbit", "simple_background", "toe_claws", "topwear", "white_fur", "white_shirt", "white_t-shirt", "white_topwear"], "stage3_selected_scores": {"fur": 0.6531, "simple_background": 0.416, "open_mouth": 0.633, "claws": 0.6303, "white_fur": 0.5149, "topwear": 0.6439, "fox": 0.638, "rabbit": 0.6511, "toe_claws": 0.5549, "grey_background": 0.6784, "long_ears": 0.4628, "facial_markings": 0.6945, "blue_clothing": 0.6538, "white_topwear": 0.7671, "black_bottomwear": 0.7384, "white_shirt": 0.8197, "dress_shirt": 0.6688, "black_pants": 0.833, "grey_shirt": 0.6923, "3_claws": 0.5531, "white_t-shirt": 0.7504, "fluffy_fur": 0.4964}, "stage3_selected_ranks": {"fur": 40, "simple_background": 171, "open_mouth": 48, "claws": 50, "white_fur": 132, "topwear": 43, "fox": 46, "rabbit": 41, "toe_claws": 95, "grey_background": 30, "long_ears": 155, "facial_markings": 27, "blue_clothing": 39, "white_topwear": 6, "black_bottomwear": 19, "white_shirt": 4, "dress_shirt": 32, "black_pants": 3, "grey_shirt": 28, "3_claws": 96, "white_t-shirt": 15, "fluffy_fur": 139}, "stage3_selected_phrase_ranks": {"fur": 1, "simple_background": 8, "open_mouth": 1, "claws": 1, "white_fur": 4, "topwear": 7, "fox": 1, "rabbit": 1, "toe_claws": 4, "grey_background": 1, "long_ears": 10, "facial_markings": 1, "blue_clothing": 8, "white_topwear": 3, "black_bottomwear": 5, "white_shirt": 1, "dress_shirt": 5, "black_pants": 1, "grey_shirt": 4, 
"3_claws": 5, "white_t-shirt": 10, "fluffy_fur": 5}, "extra_evidence": {"3_claws": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5531}, "black_bottomwear": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7384}, "black_clothing": {"source": "implied"}, "black_pants": {"source": "stage3", "why": "explicit", "retrieval_score": 0.833}, "blue_clothing": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6538}, "dress_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6688}, "fluffy_fur": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4964}, "grey_clothing": {"source": "implied"}, "grey_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6923}, "grey_topwear": {"source": "implied"}, "long_ears": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4628}, "looking_at_viewer": {"source": "structural"}, "open_mouth": {"source": "stage3", "why": "explicit", "retrieval_score": 0.633}, "t-shirt": {"source": "implied"}, "toe_claws": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5549}, "white_body": {"source": "implied"}, "white_clothing": {"source": "implied"}, "white_fur": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5149}, "white_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8197}, "white_t-shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7504}, "white_topwear": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7671}}, "structural": ["duo", "anthro", "clothed", "looking_at_viewer"], "probe": ["simple_background", "anthro", "clothing", "duo"], "t1": 1.6, "t2": 1.58, "t3": 53.13, "t3s": 2.35, "t3p": 1.07, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=169 entity=5 copyright_filtered=0 generic_char_to_general=0 unknown_type=3", "Stage3 general_chunk_1: attempt 1 error: Failed to parse 
Stage3SelectionResponse from completion {\"selections\": [{\"i\": 2, \"why\": \"explicit\"}, {\"i\": 3, \"why\": \"explicit\"}, {\"i\": 4, \"why\": \"style_or_meta\"}, {\"i\": 5, \"why\": \"explicit\"}, {\"i\": 6, \"why\": \"explicit\"}, {\"i\": 7, \"why\": \"explicit\"}, {\"i\": 10, \"why\": \"explicit\"}, {\"i\": 11, \"why\": \"explicit\"}, {\"i\": 12, \"why\": \"style_or_meta\"}, {\"i\": 14, \"why\": \"explicit\"}, {\"i\": 15, \"why\": \"weak_implied\"}, {\"i\": 16, \"why\": \"weak_implied\"}, {\"i\": 17, \"why\": \"style_or_meta\"}, {\"i\": 20, \"why\": \"other\"}, {\"i\": 22, \"why\": \"explicit\"}, {\"i\": 23, \"why\": \"explicit\"}, {\"i\": 24, \"why\": \"style_or_meta\"}, {\"i\": 26, \"why\": \"explicit\"}, {\"i\": 27, \"why\": \"explicit\"}, {\"i\": 29, \"why\": \"explicit\"}, {\"i\": 30, \"why\": \"explicit\"}, {\"i\": 31, \"why\": \"explicit\"}, {\"i\": 32, \"why\": \"explicit\"}, {\"i\": 33, \"why\": \"other\"}, {\"i\": 35, \"why\": \"style_or_meta\"}, {\"i\": 36, \"why\": \"explicit\"}, {\"i\": 38, \"why\": \"other\"}, {\"i\": 40, \"why\": \"other\"}, {\"i\": 42, \"why\": \"style_or_meta\"}, {\"i\": 43, \"why\": \"style_or_meta\"}, {\"i\": 45, \"why\": \"weak_implied\"}, {\"i\": 46, \"why\": \"explicit\"}, {\"i\": 47, \"why\": \"explicit\"}, {\"i\": 48, \"why\": \"weak_implied\"}, {\"i\": 49, \"why\": \"explicit\"}, {\"i\": 50, \"why\": \"weak_implied\"}, {\"i\": 51}]}. Got: 1 validation error for Stage3SelectionResponse\nselections.36.why\n Field required [type=missing, input_value={'i': 51}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE "]}
11
+ {"id": 2034167, "n_gt": 11, "n_retrieved": 202, "n_selected": 60, "n_implied": 18, "n_structural": 5, "n_probe": 3, "ret_R": 0.6364, "P": 0.1833, "R": 1.0, "F1": 0.3099, "leaf_P": 0.1538, "leaf_R": 0.8571, "leaf_F1": 0.2609, "n_leaf_sel": 39, "n_leaf_gt": 7, "ret_P": 0.0347, "sel_given_ret": 1.5714, "over_sel": 5.45, "why": {"explicit": 36, "strong_implied": 1}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 4, "calls_with_selection": 4, "calls_exhausted_retries": 0, "attempts_total": 4, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 4, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 105, "attempts_by_n_local": {"60": {"attempts": 3, "parse_ok": 3, "parse_fail": 0, "errors": 0}, "26": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.1897, "gen_R": 1.0, "gen_F1": 0.3188, "missed": [], "extra": ["abs", "animal_humanoid", "animated", "animated_png", "anthro", "blue_inner_ear_fluff", "blue_pawpads", "blue_paws", "blue_tail", "blue_tuft", "blush", "canid_humanoid", "canine_humanoid", "clothed", "clothing", "curved_tail", "dog_humanoid", "generation_2_pokemon", "glistening", "glistening_nose", "grey_nose", "heterochromia", "humanoid", "inner_ear_fluff", "intersex", "jumping", "jumpluff", "looking_at_viewer", "mammal_humanoid", "muscular", "nintendo", "pawpads", "pink_mouth", "pink_stripes", "pokemon", "pokemon_(species)", "posed", "sparkling_character", "stripes", "swinging", "tail", "teeth", "thong_straps", "tuft", "two_tone_tail", "white_inner_ear_fluff", "white_nose", "white_stripes", "white_tail"], "ground_truth_tags": ["blue_eyes", "blue_nose", "canid", "canine", "fur", "mammal", "open_mouth", "purple_body", "solo", "white_body", "white_fur"], "selected_tags": ["abs", 
"animal_humanoid", "animated", "animated_png", "anthro", "blue_eyes", "blue_inner_ear_fluff", "blue_nose", "blue_pawpads", "blue_paws", "blue_tail", "blue_tuft", "blush", "canid", "canid_humanoid", "canine", "canine_humanoid", "clothed", "clothing", "curved_tail", "dog_humanoid", "fur", "generation_2_pokemon", "glistening", "glistening_nose", "grey_nose", "heterochromia", "humanoid", "inner_ear_fluff", "intersex", "jumping", "jumpluff", "looking_at_viewer", "mammal", "mammal_humanoid", "muscular", "nintendo", "open_mouth", "pawpads", "pink_mouth", "pink_stripes", "pokemon", "pokemon_(species)", "posed", "purple_body", "solo", "sparkling_character", "stripes", "swinging", "tail", "teeth", "thong_straps", "tuft", "two_tone_tail", "white_body", "white_fur", "white_inner_ear_fluff", "white_nose", "white_stripes", "white_tail"], "stage3_selected": ["abs", "animated", "animated_png", "blue_eyes", "blue_inner_ear_fluff", "blue_nose", "blue_pawpads", "blue_paws", "blue_tail", "blue_tuft", "blush", "canine_humanoid", "curved_tail", "dog_humanoid", "glistening_nose", "grey_nose", "heterochromia", "jumping", "jumpluff", "muscular", "open_mouth", "pink_mouth", "pink_stripes", "posed", "purple_body", "simple_background", "sparkling_character", "stripes", "swinging", "teeth", "thong_straps", "two_tone_tail", "white_fur", "white_inner_ear_fluff", "white_nose", "white_stripes", "white_tail"], "stage3_selected_scores": {"simple_background": 0.5831, "blush": 0.3631, "open_mouth": 0.5904, "teeth": 0.3451, "blue_eyes": 0.5873, "white_fur": 0.5819, "muscular": 0.3492, "abs": 0.3151, "stripes": 0.583, "purple_body": 0.5531, "canine_humanoid": 0.9138, "white_tail": 0.4849, "heterochromia": 0.428, "two_tone_tail": 0.4834, "blue_nose": 0.5939, "blue_tail": 0.5092, "blue_pawpads": 0.4725, "white_inner_ear_fluff": 0.5683, "dog_humanoid": 0.8087, "grey_nose": 0.4311, "glistening_nose": 0.4258, "white_stripes": 0.534, "white_nose": 0.4882, "thong_straps": 0.3103, "blue_paws": 0.4669, 
"pink_stripes": 0.5472, "blue_inner_ear_fluff": 0.4647, "curved_tail": 0.5999, "pink_mouth": 0.4798, "swinging": 0.338, "sparkling_character": 0.3409, "blue_tuft": 0.4615, "jumpluff": 0.3484, "posed": 0.4332, "animated_png": 0.4499, "animated": 0.3743, "jumping": 0.5806}, "stage3_selected_ranks": {"simple_background": 23, "blush": 175, "open_mouth": 21, "teeth": 186, "blue_eyes": 22, "white_fur": 25, "muscular": 181, "abs": 201, "stripes": 24, "purple_body": 38, "canine_humanoid": 1, "white_tail": 83, "heterochromia": 143, "two_tone_tail": 89, "blue_nose": 19, "blue_tail": 67, "blue_pawpads": 100, "white_inner_ear_fluff": 29, "dog_humanoid": 5, "grey_nose": 139, "glistening_nose": 144, "white_stripes": 52, "white_nose": 80, "thong_straps": 203, "blue_paws": 104, "pink_stripes": 42, "blue_inner_ear_fluff": 108, "curved_tail": 16, "pink_mouth": 93, "swinging": 192, "sparkling_character": 191, "blue_tuft": 112, "jumpluff": 182, "posed": 137, "animated_png": 123, "animated": 170, "jumping": 26}, "stage3_selected_phrase_ranks": {"simple_background": 1, "blush": 4, "open_mouth": 1, "teeth": 9, "blue_eyes": 1, "white_fur": 1, "muscular": 6, "abs": 7, "stripes": 1, "purple_body": 1, "canine_humanoid": 1, "white_tail": 7, "heterochromia": 6, "two_tone_tail": 9, "blue_nose": 1, "blue_tail": 2, "blue_pawpads": 6, "white_inner_ear_fluff": 2, "dog_humanoid": 5, "grey_nose": 7, "glistening_nose": 9, "white_stripes": 4, "white_nose": 3, "thong_straps": 7, "blue_paws": 8, "pink_stripes": 1, "blue_inner_ear_fluff": 5, "curved_tail": 1, "pink_mouth": 6, "swinging": 9, "sparkling_character": 8, "blue_tuft": 10, "jumpluff": 7, "posed": 8, "animated_png": 1, "animated": 4, "jumping": 1}, "extra_evidence": {"abs": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3151}, "animal_humanoid": {"source": "implied"}, "animated": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3743}, "animated_png": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4499}, 
"anthro": {"source": "structural"}, "blue_inner_ear_fluff": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4647}, "blue_pawpads": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4725}, "blue_paws": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4669}, "blue_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5092}, "blue_tuft": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4615}, "blush": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3631}, "canid_humanoid": {"source": "implied"}, "canine_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9138}, "clothed": {"source": "structural"}, "clothing": {"source": "implied"}, "curved_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5999}, "dog_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8087}, "generation_2_pokemon": {"source": "implied"}, "glistening": {"source": "implied"}, "glistening_nose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4258}, "grey_nose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4311}, "heterochromia": {"source": "stage3", "why": "explicit", "retrieval_score": 0.428}, "humanoid": {"source": "implied"}, "inner_ear_fluff": {"source": "implied"}, "intersex": {"source": "structural"}, "jumping": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.5806}, "jumpluff": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3484}, "looking_at_viewer": {"source": "structural"}, "mammal_humanoid": {"source": "implied"}, "muscular": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3492}, "nintendo": {"source": "implied"}, "pawpads": {"source": "implied"}, "pink_mouth": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4798}, "pink_stripes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5472}, "pokemon": {"source": "implied"}, "pokemon_(species)": {"source": "implied"}, 
"posed": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4332}, "sparkling_character": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3409}, "stripes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.583}, "swinging": {"source": "stage3", "why": "explicit", "retrieval_score": 0.338}, "tail": {"source": "implied"}, "teeth": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3451}, "thong_straps": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3103}, "tuft": {"source": "implied"}, "two_tone_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4834}, "white_inner_ear_fluff": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5683}, "white_nose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4882}, "white_stripes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.534}, "white_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4849}}, "structural": ["solo", "anthro", "intersex", "clothed", "looking_at_viewer"], "probe": ["anthro", "canid", "solo"], "t1": 2.77, "t2": 1.81, "t3": 47.96, "t3s": 1.23, "t3p": 6.5, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=206 entity=2 copyright_filtered=1 generic_char_to_general=4 unknown_type=12"]}
data/eval_results/latency_chunk100_seed42.jsonl ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"_meta": true, "timestamp": "2026-03-02T05:55:26.334510", "n_samples": 10, "caption_field": "caption_cogvlm", "skip_rewrite": false, "allow_nsfw": false, "mode": "chunked_map_union", "chunk_size": 100, "eval_path": "data/eval_samples/e621_sfw_sample_1000_seed123_buffer10000_caption_evident.jsonl", "per_phrase_k": 2, "per_phrase_final_k": 10, "temperature": 0.0, "shuffle": false, "seed": 42, "workers": 4, "min_why": "strong_implied", "expand_implications": true, "infer_structural": true, "infer_probe": true, "n_errors": 0, "n_issue_samples": 10, "n_issues_total": 45}
2
+ {"id": 3285630, "n_gt": 12, "n_retrieved": 153, "n_selected": 38, "n_implied": 20, "n_structural": 4, "n_probe": 5, "ret_R": 0.3333, "P": 0.2105, "R": 0.6667, "F1": 0.32, "leaf_P": 0.2353, "leaf_R": 0.4444, "leaf_F1": 0.3077, "n_leaf_sel": 17, "n_leaf_gt": 9, "ret_P": 0.0261, "sel_given_ret": 2.0, "over_sel": 3.17, "why": {"explicit": 13}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 54, "attempts_by_n_local": {"100": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}, "56": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.2105, "gen_R": 0.6667, "gen_F1": 0.32, "missed": ["alpha_channel", "feline", "fingers", "hair"], "extra": ["beverage", "black_body", "black_fur", "business_suit", "container", "cup", "formal", "grey_clothing", "grey_shirt", "grey_topwear", "holding_beverage", "holding_container", "holding_cup", "holding_mug", "holding_object", "mug", "necktie", "shirt", "suit", "t-shirt", "teal_shirt", "teal_topwear", "text", "topwear", "vest", "white_clothing", "white_necktie", "white_shirt", "white_t-shirt", "white_topwear"], "ground_truth_tags": ["alpha_channel", "anthro", "clothed", "clothing", "felid", "feline", "fingers", "fur", "hair", "male", "mammal", "solo"], "selected_tags": ["anthro", "beverage", "black_body", "black_fur", "business_suit", "clothed", "clothing", "container", "cup", "felid", "formal", "fur", "grey_clothing", "grey_shirt", "grey_topwear", "holding_beverage", "holding_container", "holding_cup", "holding_mug", "holding_object", "male", "mammal", "mug", 
"necktie", "shirt", "solo", "suit", "t-shirt", "teal_shirt", "teal_topwear", "text", "topwear", "vest", "white_clothing", "white_necktie", "white_shirt", "white_t-shirt", "white_topwear"], "stage3_selected": ["anthro", "black_fur", "business_suit", "formal", "grey_shirt", "holding_beverage", "holding_cup", "holding_mug", "simple_background", "teal_shirt", "vest", "white_necktie", "white_t-shirt"], "stage3_selected_scores": {"anthro": 0.4929, "simple_background": 0.6978, "black_fur": 0.7183, "vest": 0.8403, "holding_cup": 0.7667, "holding_beverage": 0.7721, "grey_shirt": 0.7582, "business_suit": 0.5746, "holding_mug": 0.916, "white_t-shirt": 0.7329, "formal": 0.5993, "teal_shirt": 0.7474, "white_necktie": 0.6418}, "stage3_selected_ranks": {"anthro": 135, "simple_background": 50, "black_fur": 35, "vest": 3, "holding_cup": 10, "holding_beverage": 8, "grey_shirt": 13, "business_suit": 97, "holding_mug": 1, "white_t-shirt": 26, "formal": 84, "teal_shirt": 19, "white_necktie": 66}, "stage3_selected_phrase_ranks": {"anthro": 9, "simple_background": 1, "black_fur": 1, "vest": 1, "holding_cup": 4, "holding_beverage": 3, "grey_shirt": 1, "business_suit": 8, "holding_mug": 1, "white_t-shirt": 3, "formal": 1, "teal_shirt": 4, "white_necktie": 10}, "extra_evidence": {"beverage": {"source": "implied"}, "black_body": {"source": "implied"}, "black_fur": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7183}, "business_suit": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5746}, "container": {"source": "implied"}, "cup": {"source": "implied"}, "formal": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5993}, "grey_clothing": {"source": "implied"}, "grey_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7582}, "grey_topwear": {"source": "implied"}, "holding_beverage": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7721}, "holding_container": {"source": "implied"}, "holding_cup": {"source": "stage3", "why": 
"explicit", "retrieval_score": 0.7667}, "holding_mug": {"source": "stage3", "why": "explicit", "retrieval_score": 0.916}, "holding_object": {"source": "implied"}, "mug": {"source": "implied"}, "necktie": {"source": "implied"}, "shirt": {"source": "implied"}, "suit": {"source": "implied"}, "t-shirt": {"source": "implied"}, "teal_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7474}, "teal_topwear": {"source": "implied"}, "text": {"source": "probe"}, "topwear": {"source": "implied"}, "vest": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8403}, "white_clothing": {"source": "implied"}, "white_necktie": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6418}, "white_shirt": {"source": "implied"}, "white_t-shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7329}, "white_topwear": {"source": "implied"}}, "structural": ["solo", "anthro", "male", "clothed"], "probe": ["clothing", "anthro", "text", "felid", "solo"], "t1": 3.14, "t2": 7.11, "t3": 95.33, "t3s": 5.66, "t3p": 3.88, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=156 entity=1 copyright_filtered=1 generic_char_to_general=0 unknown_type=2"]}
3
+ {"id": 260449, "n_gt": 14, "n_retrieved": 161, "n_selected": 24, "n_implied": 4, "n_structural": 5, "n_probe": 6, "ret_R": 0.5714, "P": 0.4167, "R": 0.7143, "F1": 0.5263, "leaf_P": 0.2, "leaf_R": 0.3, "leaf_F1": 0.24, "n_leaf_sel": 15, "n_leaf_gt": 10, "ret_P": 0.0497, "sel_given_ret": 1.25, "over_sel": 1.71, "why": {"explicit": 8, "strong_implied": 5}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 4, "attempt_errors": 2, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 42, "attempts_by_n_local": {"100": {"attempts": 3, "parse_ok": 1, "parse_fail": 0, "errors": 2}, "62": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.5, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.4167, "gen_R": 0.7143, "gen_F1": 0.5263, "missed": ["ape", "fur", "hair", "human"], "extra": ["anthro", "bottomwear", "chasing", "cloth", "duo", "flash", "interactive", "laugh", "loincloth", "monkey", "raised_arms", "topless", "trio", "ursine"], "ground_truth_tags": ["ape", "bear", "clothed", "clothing", "dancing", "fur", "group", "hair", "haplorhine", "human", "looking_at_viewer", "male", "mammal", "primate"], "selected_tags": ["anthro", "bear", "bottomwear", "chasing", "cloth", "clothed", "clothing", "dancing", "duo", "flash", "group", "haplorhine", "interactive", "laugh", "loincloth", "looking_at_viewer", "male", "mammal", "monkey", "primate", "raised_arms", "topless", "trio", "ursine"], "stage3_selected": ["bear", "chasing", "cloth", "dancing", "interactive", "laugh", "loincloth", "looking_at_viewer", "male", "monkey", "primate", "raised_arms", "ursine"], "stage3_selected_scores": {"looking_at_viewer": 0.5455, "bear": 0.5731, "primate": 0.89, "ursine": 
0.4377, "loincloth": 0.5677, "monkey": 0.7553, "raised_arms": 0.5437, "cloth": 0.325, "male": 0.5579, "dancing": 0.5556, "laugh": 0.5253, "chasing": 0.3326, "interactive": 0.4063}, "stage3_selected_ranks": {"looking_at_viewer": 18, "bear": 12, "primate": 2, "ursine": 45, "loincloth": 13, "monkey": 6, "raised_arms": 19, "cloth": 155, "male": 15, "dancing": 16, "laugh": 22, "chasing": 148, "interactive": 73}, "stage3_selected_phrase_ranks": {"looking_at_viewer": 1, "bear": 1, "primate": 1, "ursine": 8, "loincloth": 1, "monkey": 6, "raised_arms": 1, "cloth": 9, "male": 1, "dancing": 1, "laugh": 1, "chasing": 9, "interactive": 1}, "extra_evidence": {"anthro": {"source": "structural"}, "bottomwear": {"source": "implied"}, "chasing": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.3326}, "cloth": {"source": "stage3", "why": "explicit", "retrieval_score": 0.325}, "duo": {"source": "probe"}, "flash": {"source": "implied"}, "interactive": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4063}, "laugh": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.5253}, "loincloth": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5677}, "monkey": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7553}, "raised_arms": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5437}, "topless": {"source": "structural"}, "trio": {"source": "structural"}, "ursine": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4377}}, "structural": ["trio", "anthro", "male", "clothed", "topless"], "probe": ["clothing", "simple_background", "anthro", "duo", "group", "bear"], "t1": 2.5, "t2": 5.44, "t3": 54.43, "t3s": 4.59, "t3p": 8.2, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=162 entity=5 copyright_filtered=2 generic_char_to_general=1 unknown_type=3", "Stage3 general_chunk_0: attempt 1 error: Failed to parse 
Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"explicit\"}, {\"i\": 3, \"why\": \"explicit\"}, {\"i\": 5, \"why\": \"explicit\"}, {\"i\": 6, \"why\": \"explicit\"}, {\"i\": 9, \"why\": \"explicit\"}, {\"i\": 15, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"explicit\"}, {\"i\": 17, \"why\": \"weak_implied\"}, {\"i\": 20, \"why\": \"weak_implied\"}, {\"i\": 25, \"why\": \"explicit\"}, {\"i\": 27, \"why\": \"weak_implied\"}, {\"i\": 30, \"why\": \"other\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 35, \"why\": \"other\"}, {\"i\": 40, \"why\": \"style_or_meta\"}, {\"i\": 43, \"why\": \"weak_implied\"}, {\"i\": 45, \"why\": \"weak_implied\"}, {\"i\": 47, \"why\": \"weak_implied\"}, {\"i\": 50, \"why\": \"weak_implied\"}, {\"i\": 53, \"why\": \"weak_implied\"}, {\"i\": 55, \"why\": \"weak_implied\"}, {\"i\": 57, \"why\": \"weak_implied\"}, {\"i\": 59, \"why\": \"weak_implied\"}, {\"i\": 61, \"why\": \"weak_implied\"}, {\"i\": 63, \"why\": \"style_or_meta\"}, {\"i\": 65, \"why\": \"weak_implied\"}, {\"i\": 67, \"why\": \"weak_implied\"}, {\"i\": 69, \"why\": \"explicit\"}, {\"i\": 71, \"why\": \"style_or_meta\"}, {\"i\": 73, \"why\": \"weak_implied\"}, {\"i\": 75, \"why\": \"weak_implied\"}, {\"i\": 77, \"why\": \"weak_implied\"}, {\"i\": 79, \"why\": \"other\"}, {\"i\": 81, \"why\": \"weak_implied\"}, {\"i\": 83, \"why\": \"style_or_meta\"}, {}]}. 
Got: 2 validation errors for Stage3SelectionResponse\nselections.35.i\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nselections.35.why\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_0: attempt 2 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"explicit\"}, {\"i\": 3, \"why\": \"explicit\"}, {\"i\": 5, \"why\": \"explicit\"}, {\"i\": 6, \"why\": \"explicit\"}, {\"i\": 9, \"why\": \"explicit\"}, {\"i\": 15, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"explicit\"}, {\"i\": 17, \"why\": \"weak_implied\"}, {\"i\": 20, \"why\": \"weak_implied\"}, {\"i\": 25, \"why\": \"explicit\"}, {\"i\": 27, \"why\": \"weak_implied\"}, {\"i\": 30, \"why\": \"other\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 35, \"why\": \"other\"}, {\"i\": 37, \"why\": \"other\"}, {\"i\": 39, \"why\": \"weak_implied\"}, {\"i\": 41, \"why\": \"weak_implied\"}, {\"i\": 43, \"why\": \"weak_implied\"}, {\"i\": 45, \"why\": \"weak_implied\"}, {\"i\": 47, \"why\": \"weak_implied\"}, {\"i\": 49, \"why\": \"weak_implied\"}, {\"i\": 51, \"why\": \"other\"}, {\"i\": 53, \"why\": \"weak_implied\"}, {\"i\": 55, \"why\": \"weak_implied\"}, {\"i\": 57, \"why\": \"weak_implied\"}, {\"i\": 59, \"why\": \"weak_implied\"}, {\"i\": 61, \"why\": \"weak_implied\"}, {\"i\": 63, \"why\": \"weak_implied\"}, {\"i\": 65, \"why\": \"weak_implied\"}, {\"i\": 67, \"why\": \"weak_implied\"}, {\"i\": 69, \"why\": \"explicit\"}, {\"i\": 71, \"why\": \"other\"}, {\"i\": 73, \"why\": \"weak_implied\"}, {\"i\": 75, \"why\": \"weak_implied\"}, {\"i\": 77, \"why\": \"weak_implied\"}, {\"i\": 79, \"\": null}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.35.why\n Field required [type=missing, input_value={'i': 79, '': None}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE "]}
4
+ {"id": 1078019, "n_gt": 14, "n_retrieved": 149, "n_selected": 17, "n_implied": 2, "n_structural": 3, "n_probe": 5, "ret_R": 0.7143, "P": 0.2941, "R": 0.3571, "F1": 0.3226, "leaf_P": 0.2857, "leaf_R": 0.4444, "leaf_F1": 0.3478, "n_leaf_sel": 14, "n_leaf_gt": 9, "ret_P": 0.0671, "sel_given_ret": 0.5, "over_sel": 1.21, "why": {"explicit": 10}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 1, "calls_exhausted_retries": 1, "attempts_total": 4, "attempt_errors": 3, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 14, "attempts_by_n_local": {"100": {"attempts": 3, "parse_ok": 0, "parse_fail": 0, "errors": 3}, "46": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.75, "call_exhaustion_rate": 0.5}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.2941, "gen_R": 0.3571, "gen_F1": 0.3226, "missed": ["blue_eyes", "lagomorph", "leporid", "mammal", "plushie", "rabbit", "romantic", "romantic_couple", "teal_eyes"], "extra": ["<3", "blue_hair", "blushing_profusely", "confident", "fur_coat", "hair", "overcoat", "relaxed_expression", "teal_clothing", "topwear", "vest", "winter_coat"], "ground_truth_tags": ["anthro", "blue_eyes", "blush", "clothed", "clothing", "duo", "lagomorph", "leporid", "mammal", "plushie", "rabbit", "romantic", "romantic_couple", "teal_eyes"], "selected_tags": ["<3", "anthro", "blue_hair", "blush", "blushing_profusely", "clothed", "clothing", "confident", "duo", "fur_coat", "hair", "overcoat", "relaxed_expression", "teal_clothing", "topwear", "vest", "winter_coat"], "stage3_selected": ["anthro", "blue_hair", "blushing_profusely", "confident", "fur_coat", "overcoat", "relaxed_expression", "teal_clothing", "vest", "winter_coat"], "stage3_selected_scores": {"anthro": 0.4155, "blue_hair": 0.4164, 
"vest": 0.4931, "blushing_profusely": 0.4371, "confident": 0.5026, "fur_coat": 0.4929, "winter_coat": 0.4685, "teal_clothing": 0.4159, "overcoat": 0.4981, "relaxed_expression": 0.5032}, "stage3_selected_ranks": {"anthro": 119, "blue_hair": 115, "vest": 57, "blushing_profusely": 94, "confident": 46, "fur_coat": 58, "winter_coat": 71, "teal_clothing": 117, "overcoat": 52, "relaxed_expression": 44}, "stage3_selected_phrase_ranks": {"anthro": 8, "blue_hair": 7, "vest": 8, "blushing_profusely": 4, "confident": 5, "fur_coat": 9, "winter_coat": 10, "teal_clothing": 9, "overcoat": 6, "relaxed_expression": 1}, "extra_evidence": {"<3": {"source": "probe"}, "blue_hair": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4164}, "blushing_profusely": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4371}, "confident": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5026}, "fur_coat": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4929}, "hair": {"source": "implied"}, "overcoat": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4981}, "relaxed_expression": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5032}, "teal_clothing": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4159}, "topwear": {"source": "implied"}, "vest": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4931}, "winter_coat": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4685}}, "structural": ["duo", "anthro", "clothed"], "probe": ["clothing", "anthro", "blush", "duo", "<3"], "t1": 3.51, "t2": 7.03, "t3": 55.27, "t3s": 5.85, "t3p": 4.92, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=146 entity=3 copyright_filtered=1 generic_char_to_general=0 unknown_type=2", "Stage3 general_chunk_0: attempt 1 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 2, \"why\": 
\"explicit\"}, {\"i\": 7, \"why\": \"explicit\"}, {\"i\": 11, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"weak_implied\"}, {\"i\": 17, \"why\": \"weak_implied\"}, {\"i\": 19, \"why\": \"weak_implied\"}, {\"i\": 20, \"why\": \"weak_implied\"}, {\"i\": 25, \"why\": \"weak_implied\"}, {\"i\": 26, \"why\": \"weak_implied\"}, {\"i\": 35, \"why\": \"weak_implied\"}, {\"i\": 36, \"why\": \"explicit\"}, {\"i\": 37, \"why\": \"other\"}, {\"i\": 38, \"why\": \"explicit\"}, {\"i\": 45, \"why\": \"weak_implied\"}, {\"i\": 46, \"why\": \"weak_implied\"}, {\"i\": 47, \"why\": \"weak_implied\"}, {\"i\": 48, \"why\": \"weak_implied\"}, {\"i\": 49, \"why\": \"weak_implied\"}, {\"i\": 50, \"why\": \"weak_implied\"}, {\"i\": 51, \"why\": \"weak_implied\"}, {\"i\": 52, \"why\": \"weak_implied\"}, {\"i\": 53, \"why\": \"weak_implied\"}, {\"i\": 54, \"why\": \"explicit\"}, {\"i\": 55, \"why\": \"weak_implied\"}, {\"i\": 56, \"why\": \"weak_implied\"}, {\"i\": 57, \"why\": \"weak_implied\"}, {\"i\": 58, \"why\": \"weak_implied\"}, {\"i\": 59, \"why\": \"weak_implied\"}, {\"i\": 60, \"why\": \"weak_implied\"}, {\"i\": 61, \"why\": \"weak_implied\"}, {\"i\": 62, \"why\": \"weak_implied\"}, {\"i\": 63, \"why\": \"weak_implied\"}, {\"i\": 64, \"why\": \"weak_implied\"}, {\"i\": 65, \"why\": \"weak_implied\"}, {\"i\": 66}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.34.why\n Field required [type=missing, input_value={'i': 66}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_0: attempt 2 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 2, \"why\": \"explicit\"}, {\"i\": 7, \"why\": \"explicit\"}, {\"i\": 11, \"why\": \"explicit\"}, {\"i\": 15, \"why\": \"explicit\"}, {\"i\": 25, \"why\": \"explicit\"}, {\"i\": 5, \"why\": \"strong_implied\"}, {\"i\": 12, \"why\": \"strong_implied\"}, {\"i\": 68, \"why\": \"other\"}, {\"i\": 27, \"why\": \"style_or_meta\"}, {\"i\": 16, \"why\": \"weak_implied\"}, {\"i\": 20, \"why\": \"weak_implied\"}, {\"i\": 22, \"why\": \"weak_implied\"}, {\"i\": 45, \"why\": \"weak_implied\"}, {\"i\": 46, \"why\": \"weak_implied\"}, {\"i\": 1, \"why\": \"other\"}, {\"i\": 3, \"why\": \"other\"}, {\"i\": 6, \"why\": \"other\"}, {\"i\": 9, \"why\": \"other\"}, {\"i\": 10, \"why\": \"other\"}, {\"i\": 13, \"why\": \"other\"}, {\"i\": 17, \"why\": \"other\"}, {\"i\": 19, \"why\": \"other\"}, {\"i\": 23, \"why\": \"other\"}, {\"i\": 24, \"why\": \"other\"}, {\"i\": 30, \"why\": \"other\"}, {\"i\": 31, \"why\": \"other\"}, {\"i\": 33, \"why\": \"other\"}, {\"i\": 34, \"why\": \"other\"}, {\"i\": 35, \"why\": \"other\"}, {\"i\": 37, \"why\": \"other\"}, {\"i\": 38, \"why\": \"other\"}, {\"i\": 40, \"why\": \"other\"}, {\"i\": 41, \"why\": \"other\"}, {\"i\": 42, \"why\": \"other\"}, {\"i\": 43, \"why\": \"other\"}, {\"i\": 44, \"why\": \"other\"}, {\"i\": 47, \"why\": \"other\"}, {\"i\": 48, \"\": null}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.37.why\n Field required [type=missing, input_value={'i': 48, '': None}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_0: attempt 3 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 2, \"why\": \"explicit\"}, {\"i\": 5, \"why\": \"explicit\"}, {\"i\": 7, \"why\": \"explicit\"}, {\"i\": 11, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"weak_implied\"}, {\"i\": 17, \"why\": \"weak_implied\"}, {\"i\": 19, \"why\": \"weak_implied\"}, {\"i\": 20, \"why\": \"weak_implied\"}, {\"i\": 25, \"why\": \"weak_implied\"}, {\"i\": 26, \"why\": \"weak_implied\"}, {\"i\": 27, \"why\": \"weak_implied\"}, {\"i\": 31, \"why\": \"weak_implied\"}, {\"i\": 35, \"why\": \"weak_implied\"}, {\"i\": 36, \"why\": \"explicit\"}, {\"i\": 37, \"why\": \"other\"}, {\"i\": 38, \"why\": \"explicit\"}, {\"i\": 45, \"why\": \"weak_implied\"}, {\"i\": 46, \"why\": \"weak_implied\"}, {\"i\": 47, \"why\": \"weak_implied\"}, {\"i\": 48, \"why\": \"weak_implied\"}, {\"i\": 49, \"why\": \"weak_implied\"}, {\"i\": 50, \"why\": \"weak_implied\"}, {\"i\": 51, \"why\": \"weak_implied\"}, {\"i\": 52, \"why\": \"weak_implied\"}, {\"i\": 53, \"why\": \"weak_implied\"}, {\"i\": 54, \"why\": \"explicit\"}, {\"i\": 55, \"why\": \"weak_implied\"}, {\"i\": 56, \"why\": \"weak_implied\"}, {\"i\": 57, \"why\": \"weak_implied\"}, {\"i\": 58, \"why\": \"weak_implied\"}, {\"i\": 59, \"why\": \"weak_implied\"}, {\"i\": 60, \"why\": \"weak_implied\"}, {\"i\": 61, \"why\": \"weak_implied\"}, {\"i\": 62, \"why\": \"weak_implied\"}, {\"i\": 63}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.34.why\n Field required [type=missing, input_value={'i': 63}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_0: gave up after 3 attempts"]}
5
+ {"id": 1624724, "n_gt": 4, "n_retrieved": 111, "n_selected": 13, "n_implied": 1, "n_structural": 5, "n_probe": 3, "ret_R": 0.75, "P": 0.1538, "R": 0.5, "F1": 0.2353, "leaf_P": 0.1667, "leaf_R": 0.5, "leaf_F1": 0.25, "n_leaf_sel": 12, "n_leaf_gt": 4, "ret_P": 0.027, "sel_given_ret": 0.6667, "over_sel": 3.25, "why": {"explicit": 6, "strong_implied": 1}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 27, "attempts_by_n_local": {"100": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}, "12": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.1538, "gen_R": 0.5, "gen_F1": 0.2353, "missed": ["smile", "tan_body"], "extra": ["ambiguous_gender", "anthro", "big_eyes", "feral", "floating", "looking_at_viewer", "nose", "nude", "red_spots", "spots", "toony"], "ground_truth_tags": ["red_nose", "smile", "solo", "tan_body"], "selected_tags": ["ambiguous_gender", "anthro", "big_eyes", "feral", "floating", "looking_at_viewer", "nose", "nude", "red_nose", "red_spots", "solo", "spots", "toony"], "stage3_selected": ["big_eyes", "floating", "nose", "red_nose", "red_spots", "toony", "white_background"], "stage3_selected_scores": {"white_background": 0.6138, "toony": 0.6021, "red_nose": 0.7501, "floating": 0.6519, "red_spots": 0.6068, "nose": 0.8607, "big_eyes": 0.6961}, "stage3_selected_ranks": {"white_background": 31, "toony": 35, "red_nose": 3, "floating": 16, "red_spots": 34, "nose": 2, "big_eyes": 6}, "stage3_selected_phrase_ranks": {"white_background": 1, "toony": 1, "red_nose": 1, "floating": 1, "red_spots": 
10, "nose": 1, "big_eyes": 1}, "extra_evidence": {"ambiguous_gender": {"source": "structural"}, "anthro": {"source": "probe"}, "big_eyes": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.6961}, "feral": {"source": "structural"}, "floating": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6519}, "looking_at_viewer": {"source": "structural"}, "nose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8607}, "nude": {"source": "structural"}, "red_spots": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6068}, "spots": {"source": "implied"}, "toony": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6021}}, "structural": ["solo", "feral", "ambiguous_gender", "nude", "looking_at_viewer"], "probe": ["simple_background", "anthro", "solo"], "t1": 2.03, "t2": 5.34, "t3": 22.63, "t3s": 3.79, "t3p": 7.0, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=112 entity=0 copyright_filtered=4 generic_char_to_general=0 unknown_type=5"]}
6
+ {"id": 1325009, "n_gt": 22, "n_retrieved": 192, "n_selected": 29, "n_implied": 9, "n_structural": 4, "n_probe": 3, "ret_R": 0.4545, "P": 0.4483, "R": 0.5909, "F1": 0.5098, "leaf_P": 0.1579, "leaf_R": 0.25, "leaf_F1": 0.1935, "n_leaf_sel": 19, "n_leaf_gt": 12, "ret_P": 0.0521, "sel_given_ret": 1.3, "over_sel": 1.32, "why": {"explicit": 15}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 3, "attempt_errors": 1, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 63, "attempts_by_n_local": {"100": {"attempts": 2, "parse_ok": 1, "parse_fail": 0, "errors": 1}, "88": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.3333333333333333, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.4483, "gen_R": 0.5909, "gen_F1": 0.5098, "missed": ["chest_tuft", "hand_on_head", "muscular", "muscular_anthro", "muscular_male", "pantherine", "tiger", "topless", "tuft"], "extra": ["backlighting", "belly", "confident", "countershade_belly", "countershade_body", "eyes", "flexing_both_biceps", "flexing_muscles", "gesture", "light", "lighting", "raised_hand", "striped_body", "striped_fur", "warm_lighting", "white_chest"], "ground_truth_tags": ["anthro", "blue_eyes", "bottomwear", "chest_tuft", "clothed", "clothing", "countershading", "felid", "fur", "hand_on_head", "male", "mammal", "muscular", "muscular_anthro", "muscular_male", "pantherine", "shorts", "solo", "stripes", "tiger", "topless", "tuft"], "selected_tags": ["anthro", "backlighting", "belly", "blue_eyes", "bottomwear", "clothed", "clothing", "confident", "countershade_belly", "countershade_body", "countershading", "eyes", "felid", "flexing_both_biceps", "flexing_muscles", "fur", "gesture", 
"light", "lighting", "male", "mammal", "raised_hand", "shorts", "solo", "striped_body", "striped_fur", "stripes", "warm_lighting", "white_chest"], "stage3_selected": ["backlighting", "blue_eyes", "confident", "countershade_belly", "countershade_body", "eyes", "flexing_both_biceps", "flexing_muscles", "gesture", "raised_hand", "shorts", "striped_body", "striped_fur", "warm_lighting", "white_chest"], "stage3_selected_scores": {"blue_eyes": 0.5973, "shorts": 0.6091, "gesture": 0.6156, "striped_body": 0.6187, "striped_fur": 0.6688, "raised_hand": 0.7153, "backlighting": 0.5866, "confident": 0.5188, "white_chest": 0.9284, "countershade_body": 0.8712, "flexing_both_biceps": 0.5618, "countershade_belly": 0.835, "warm_lighting": 0.901, "flexing_muscles": 0.6008, "eyes": 0.9788}, "stage3_selected_ranks": {"blue_eyes": 75, "shorts": 70, "gesture": 64, "striped_body": 63, "striped_fur": 46, "raised_hand": 28, "backlighting": 77, "confident": 115, "white_chest": 2, "countershade_body": 4, "flexing_both_biceps": 84, "countershade_belly": 7, "warm_lighting": 3, "flexing_muscles": 72, "eyes": 1}, "stage3_selected_phrase_ranks": {"blue_eyes": 2, "shorts": 1, "gesture": 1, "striped_body": 1, "striped_fur": 2, "raised_hand": 1, "backlighting": 6, "confident": 1, "white_chest": 1, "countershade_body": 1, "flexing_both_biceps": 8, "countershade_belly": 2, "warm_lighting": 1, "flexing_muscles": 6, "eyes": 1}, "extra_evidence": {"backlighting": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5866}, "belly": {"source": "implied"}, "confident": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5188}, "countershade_belly": {"source": "stage3", "why": "explicit", "retrieval_score": 0.835}, "countershade_body": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8712}, "eyes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9788}, "flexing_both_biceps": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5618}, "flexing_muscles": 
{"source": "stage3", "why": "explicit", "retrieval_score": 0.6008}, "gesture": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6156}, "light": {"source": "implied"}, "lighting": {"source": "implied"}, "raised_hand": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7153}, "striped_body": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6187}, "striped_fur": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6688}, "warm_lighting": {"source": "stage3", "why": "explicit", "retrieval_score": 0.901}, "white_chest": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9284}}, "structural": ["solo", "anthro", "male", "clothed"], "probe": ["anthro", "felid", "solo"], "t1": 3.19, "t2": 1.89, "t3": 43.33, "t3s": 3.06, "t3p": 5.17, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=188 entity=2 copyright_filtered=3 generic_char_to_general=0 unknown_type=2", "Stage3 general_chunk_0: attempt 1 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"explicit\"}, {\"i\": 3, \"why\": \"explicit\"}, {\"i\": 4, \"why\": \"explicit\"}, {\"i\": 5, \"why\": \"weak_implied\"}, {\"i\": 7, \"why\": \"weak_implied\"}, {\"i\": 13, \"why\": \"explicit\"}, {\"i\": 14, \"why\": \"weak_implied\"}, {\"i\": 16, \"why\": \"weak_implied\"}, {\"i\": 17, \"why\": \"weak_implied\"}, {\"i\": 20, \"why\": \"explicit\"}, {\"i\": 21, \"why\": \"weak_implied\"}, {\"i\": 23, \"why\": \"style_or_meta\"}, {\"i\": 24, \"why\": \"weak_implied\"}, {\"i\": 26, \"why\": \"explicit\"}, {\"i\": 27, \"why\": \"explicit\"}, {\"i\": 28, \"why\": \"weak_implied\"}, {\"i\": 30, \"why\": \"explicit\"}, {\"i\": 31, \"why\": \"weak_implied\"}, {\"i\": 33, \"why\": \"other\"}, {\"i\": 35, \"why\": \"weak_implied\"}, {\"i\": 37, \"why\": \"explicit\"}, {\"i\": 39, \"why\": \"weak_implied\"}, {\"i\": 41, \"why\": \"weak_implied\"}, 
{\"i\": 43, \"why\": \"style_or_meta\"}, {\"i\": 45, \"why\": \"weak_implied\"}, {\"i\": 47, \"why\": \"weak_implied\"}, {\"i\": 49, \"why\": \"weak_implied\"}, {\"i\": 50, \"why\": \"explicit\"}, {\"i\": 67, \"why\": \"explicit\"}, {\"i\": 68, \"why\": \"weak_implied\"}, {\"i\": 70, \"why\": \"style_or_meta\"}, {\"i\": 71, \"why\": \"weak_implied\"}, {\"i\": 72, \"why\": \"weak_implied\"}, {\"i\": 74, \"why\": \"weak_implied\"}, {\"i\": 75, \"why\": \"weak_implied\"}, {}]}. Got: 2 validation errors for Stage3SelectionResponse\nselections.35.i\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nselections.35.why\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE "]}
7
+ {"id": 1023509, "n_gt": 13, "n_retrieved": 202, "n_selected": 7, "n_implied": 0, "n_structural": 5, "n_probe": 4, "ret_R": 0.6923, "P": 0.1429, "R": 0.0769, "F1": 0.1, "leaf_P": 0.1667, "leaf_R": 0.1667, "leaf_F1": 0.1667, "n_leaf_sel": 6, "n_leaf_gt": 6, "ret_P": 0.0446, "sel_given_ret": 0.1111, "over_sel": 0.54, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 0, "calls_exhausted_retries": 2, "attempts_total": 6, "attempt_errors": 6, "attempt_parse_fail": 0, "attempt_parse_ok": 0, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 0, "attempts_by_n_local": {"100": {"attempts": 3, "parse_ok": 0, "parse_fail": 0, "errors": 3}, "91": {"attempts": 3, "parse_ok": 0, "parse_fail": 0, "errors": 3}}, "attempt_failure_rate": 1.0, "call_exhaustion_rate": 1.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.1429, "gen_R": 0.0769, "gen_F1": 0.1, "missed": ["bovid", "caprine", "dialogue", "fur", "goat", "human", "lizard", "mammal", "reptile", "scalie", "white_body", "white_fur"], "extra": ["anthro", "clothed", "clothing", "duo", "group", "solo"], "ground_truth_tags": ["bovid", "caprine", "dialogue", "fur", "goat", "human", "lizard", "mammal", "reptile", "scalie", "text", "white_body", "white_fur"], "selected_tags": ["anthro", "clothed", "clothing", "duo", "group", "solo", "text"], "stage3_selected": [], "stage3_selected_scores": {}, "stage3_selected_ranks": {}, "stage3_selected_phrase_ranks": {}, "extra_evidence": {"anthro": {"source": "probe"}, "clothed": {"source": "structural"}, "clothing": {"source": "probe"}, "duo": {"source": "structural"}, "group": {"source": "structural"}, "solo": {"source": "structural"}}, "structural": ["solo", "duo", "group", "clothed", "text"], "probe": ["clothing", "anthro", "text", "group"], "t1": 1.89, "t2": 2.2, "t3": 68.08, "t3s": 1.55, "t3p": 1.68, "err": 
null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=191 entity=5 copyright_filtered=6 generic_char_to_general=2 unknown_type=2", "Stage3 general_chunk_0: attempt 1 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 2, \"why\": \"explicit\"}, {\"i\": 3, \"why\": \"explicit\"}, {\"i\": 6, \"why\": \"explicit\"}, {\"i\": 8, \"why\": \"explicit\"}, {\"i\": 13, \"why\": \"explicit\"}, {\"i\": 14, \"why\": \"explicit\"}, {\"i\": 22, \"why\": \"weak_implied\"}, {\"i\": 27, \"why\": \"weak_implied\"}, {\"i\": 31, \"why\": \"weak_implied\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 36, \"why\": \"weak_implied\"}, {\"i\": 37, \"why\": \"weak_implied\"}, {\"i\": 39, \"why\": \"weak_implied\"}, {\"i\": 40, \"why\": \"weak_implied\"}, {\"i\": 42, \"why\": \"weak_implied\"}, {\"i\": 45, \"why\": \"weak_implied\"}, {\"i\": 46, \"why\": \"weak_implied\"}, {\"i\": 48, \"why\": \"weak_implied\"}, {\"i\": 50, \"why\": \"weak_implied\"}, {\"i\": 52, \"why\": \"weak_implied\"}, {\"i\": 55, \"why\": \"weak_implied\"}, {\"i\": 57, \"why\": \"weak_implied\"}, {\"i\": 59, \"why\": \"weak_implied\"}, {\"i\": 61, \"why\": \"weak_implied\"}, {\"i\": 62, \"why\": \"weak_implied\"}, {\"i\": 64, \"why\": \"weak_implied\"}, {\"i\": 66, \"why\": \"weak_implied\"}, {\"i\": 68, \"why\": \"weak_implied\"}, {\"i\": 70, \"why\": \"weak_implied\"}, {\"i\": 72, \"why\": \"weak_implied\"}, {\"i\": 74, \"why\": \"weak_implied\"}, {\"i\": 76, \"why\": \"weak_implied\"}, {\"i\": 80, \"why\": \"weak_implied\"}, {\"i\": 82, \"why\": \"weak_implied\"}, {\"i\": 84}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.34.why\n Field required [type=missing, input_value={'i': 84}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_0: attempt 2 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 2, \"why\": \"explicit\"}, {\"i\": 3, \"why\": \"explicit\"}, {\"i\": 6, \"why\": \"explicit\"}, {\"i\": 8, \"why\": \"explicit\"}, {\"i\": 10, \"why\": \"explicit\"}, {\"i\": 13, \"why\": \"explicit\"}, {\"i\": 14, \"why\": \"explicit\"}, {\"i\": 18, \"why\": \"weak_implied\"}, {\"i\": 20, \"why\": \"weak_implied\"}, {\"i\": 22, \"why\": \"weak_implied\"}, {\"i\": 24, \"why\": \"weak_implied\"}, {\"i\": 26, \"why\": \"style_or_meta\"}, {\"i\": 30, \"why\": \"explicit\"}, {\"i\": 35, \"why\": \"explicit\"}, {\"i\": 39, \"why\": \"weak_implied\"}, {\"i\": 41, \"why\": \"weak_implied\"}, {\"i\": 43, \"why\": \"weak_implied\"}, {\"i\": 45, \"why\": \"weak_implied\"}, {\"i\": 46, \"why\": \"weak_implied\"}, {\"i\": 48, \"why\": \"weak_implied\"}, {\"i\": 50, \"why\": \"weak_implied\"}, {\"i\": 52, \"why\": \"weak_implied\"}, {\"i\": 54, \"why\": \"weak_implied\"}, {\"i\": 56, \"why\": \"weak_implied\"}, {\"i\": 58, \"why\": \"weak_implied\"}, {\"i\": 60, \"why\": \"weak_implied\"}, {\"i\": 62, \"why\": \"weak_implied\"}, {\"i\": 64, \"why\": \"weak_implied\"}, {\"i\": 66, \"why\": \"weak_implied\"}, {\"i\": 68, \"why\": \"weak_implied\"}, {\"i\": 70, \"why\": \"weak_implied\"}, {\"i\": 72, \"why\": \"weak_implied\"}, {\"i\": 74, \"why\": \"weak_implied\"}, {\"i\": 76, \"why\": \"weak_implied\"}, {\"i\": 78, \"why\": \"weak_implied\"}, {\"i\": 80, \"why\": \"weak_implied\"}, {\"i\": 82, \"why\": \"weak_implied\"}, {\"i\": 84}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.37.why\n Field required [type=missing, input_value={'i': 84}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_0: attempt 3 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 2, \"why\": \"explicit\"}, {\"i\": 6, \"why\": \"explicit\"}, {\"i\": 8, \"why\": \"explicit\"}, {\"i\": 13, \"why\": \"explicit\"}, {\"i\": 14, \"why\": \"explicit\"}, {\"i\": 22, \"why\": \"weak_implied\"}, {\"i\": 31, \"why\": \"weak_implied\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 36, \"why\": \"weak_implied\"}, {\"i\": 40, \"why\": \"weak_implied\"}, {\"i\": 43, \"why\": \"weak_implied\"}, {\"i\": 46, \"why\": \"weak_implied\"}, {\"i\": 50, \"why\": \"weak_implied\"}, {\"i\": 54, \"why\": \"weak_implied\"}, {\"i\": 55, \"why\": \"weak_implied\"}, {\"i\": 60, \"why\": \"weak_implied\"}, {\"i\": 62, \"why\": \"weak_implied\"}, {\"i\": 64, \"why\": \"weak_implied\"}, {\"i\": 66, \"why\": \"weak_implied\"}, {\"i\": 68, \"why\": \"weak_implied\"}, {\"i\": 70, \"why\": \"weak_implied\"}, {\"i\": 72, \"why\": \"weak_implied\"}, {\"i\": 74, \"why\": \"weak_implied\"}, {\"i\": 76, \"why\": \"weak_implied\"}, {\"i\": 80, \"why\": \"weak_implied\"}, {\"i\": 82, \"why\": \"weak_implied\"}, {\"i\": 84, \"why\": \"weak_implied\"}, {\"i\": 86, \"why\": \"weak_implied\"}, {\"i\": 88, \"why\": \"weak_implied\"}, {\"i\": 90, \"why\": \"weak_implied\"}, {\"i\": 92, \"why\": \"weak_implied\"}, {\"i\": 94, \"why\": \"weak_implied\"}, {\"i\": 96, \"why\": \"weak_implied\"}, {\"i\": 98, \"why\": \"weak_implied\"}, {}]}. 
Got: 2 validation errors for Stage3SelectionResponse\nselections.34.i\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nselections.34.why\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_0: gave up after 3 attempts", "Stage3 general_chunk_1: attempt 1 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 2, \"why\": \"explicit\"}, {\"i\": 5, \"why\": \"explicit\"}, {\"i\": 8, \"why\": \"explicit\"}, {\"i\": 9, \"why\": \"explicit\"}, {\"i\": 17, \"why\": \"explicit\"}, {\"i\": 19, \"why\": \"explicit\"}, {\"i\": 27, \"why\": \"weak_implied\"}, {\"i\": 29, \"why\": \"weak_implied\"}, {\"i\": 31, \"why\": \"weak_implied\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 35, \"why\": \"weak_implied\"}, {\"i\": 37, \"why\": \"weak_implied\"}, {\"i\": 39, \"why\": \"weak_implied\"}, {\"i\": 41, \"why\": \"weak_implied\"}, {\"i\": 43, \"why\": \"weak_implied\"}, {\"i\": 45, \"why\": \"weak_implied\"}, {\"i\": 47, \"why\": \"weak_implied\"}, {\"i\": 49, \"why\": \"weak_implied\"}, {\"i\": 51, \"why\": \"weak_implied\"}, {\"i\": 53, \"why\": \"weak_implied\"}, {\"i\": 55, \"why\": \"weak_implied\"}, {\"i\": 57, \"why\": \"weak_implied\"}, {\"i\": 59, \"why\": \"weak_implied\"}, {\"i\": 61, \"why\": \"weak_implied\"}, {\"i\": 63, \"why\": \"weak_implied\"}, {\"i\": 65, \"why\": \"weak_implied\"}, {\"i\": 67, \"why\": \"weak_implied\"}, {\"i\": 69, \"why\": \"weak_implied\"}, {\"i\": 71, \"why\": \"weak_implied\"}, {\"i\": 73, \"why\": \"weak_implied\"}, {\"i\": 75, \"why\": \"weak_implied\"}, {\"i\": 77, \"why\": \"weak_implied\"}, {\"i\": 79, \"why\": \"weak_implied\"}, {\"i\": 81, \"why\": \"weak_implied\"}, {\"i\": 83}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.34.why\n Field required [type=missing, input_value={'i': 83}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_1: attempt 2 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 2, \"why\": \"explicit\"}, {\"i\": 5, \"why\": \"explicit\"}, {\"i\": 8, \"why\": \"explicit\"}, {\"i\": 9, \"why\": \"explicit\"}, {\"i\": 13, \"why\": \"explicit\"}, {\"i\": 18, \"why\": \"explicit\"}, {\"i\": 19, \"why\": \"explicit\"}, {\"i\": 27, \"why\": \"weak_implied\"}, {\"i\": 29, \"why\": \"weak_implied\"}, {\"i\": 31, \"why\": \"weak_implied\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 35, \"why\": \"weak_implied\"}, {\"i\": 37, \"why\": \"weak_implied\"}, {\"i\": 39, \"why\": \"weak_implied\"}, {\"i\": 41, \"why\": \"weak_implied\"}, {\"i\": 43, \"why\": \"weak_implied\"}, {\"i\": 45, \"why\": \"weak_implied\"}, {\"i\": 47, \"why\": \"weak_implied\"}, {\"i\": 49, \"why\": \"weak_implied\"}, {\"i\": 51, \"why\": \"weak_implied\"}, {\"i\": 53, \"why\": \"weak_implied\"}, {\"i\": 55, \"why\": \"weak_implied\"}, {\"i\": 57, \"why\": \"weak_implied\"}, {\"i\": 59, \"why\": \"weak_implied\"}, {\"i\": 61, \"why\": \"weak_implied\"}, {\"i\": 63, \"why\": \"weak_implied\"}, {\"i\": 65, \"why\": \"weak_implied\"}, {\"i\": 67, \"why\": \"weak_implied\"}, {\"i\": 69, \"why\": \"weak_implied\"}, {\"i\": 71, \"why\": \"weak_implied\"}, {\"i\": 73, \"why\": \"weak_implied\"}, {\"i\": 75, \"why\": \"weak_implied\"}, {\"i\": 77, \"why\": \"weak_implied\"}, {\"i\": 79, \"why\": \"weak_implied\"}, {\"i\": 81}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.34.why\n Field required [type=missing, input_value={'i': 81}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_1: attempt 3 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 2, \"why\": \"explicit\"}, {\"i\": 5, \"why\": \"explicit\"}, {\"i\": 8, \"why\": \"explicit\"}, {\"i\": 9, \"why\": \"explicit\"}, {\"i\": 13, \"why\": \"explicit\"}, {\"i\": 18, \"why\": \"explicit\"}, {\"i\": 19, \"why\": \"explicit\"}, {\"i\": 27, \"why\": \"weak_implied\"}, {\"i\": 29, \"why\": \"weak_implied\"}, {\"i\": 31, \"why\": \"weak_implied\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 35, \"why\": \"weak_implied\"}, {\"i\": 37, \"why\": \"weak_implied\"}, {\"i\": 39, \"why\": \"weak_implied\"}, {\"i\": 41, \"why\": \"weak_implied\"}, {\"i\": 43, \"why\": \"weak_implied\"}, {\"i\": 45, \"why\": \"weak_implied\"}, {\"i\": 47, \"why\": \"weak_implied\"}, {\"i\": 49, \"why\": \"weak_implied\"}, {\"i\": 51, \"why\": \"weak_implied\"}, {\"i\": 53, \"why\": \"weak_implied\"}, {\"i\": 55, \"why\": \"weak_implied\"}, {\"i\": 57, \"why\": \"weak_implied\"}, {\"i\": 59, \"why\": \"weak_implied\"}, {\"i\": 61, \"why\": \"weak_implied\"}, {\"i\": 63, \"why\": \"weak_implied\"}, {\"i\": 65, \"why\": \"weak_implied\"}, {\"i\": 67, \"why\": \"weak_implied\"}, {\"i\": 69, \"why\": \"weak_implied\"}, {\"i\": 71, \"why\": \"weak_implied\"}, {\"i\": 73, \"why\": \"weak_implied\"}, {\"i\": 75, \"why\": \"weak_implied\"}, {\"i\": 77, \"why\": \"weak_implied\"}, {\"i\": 79, \"why\": \"weak_implied\"}, {\"i\": 81}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.34.why\n Field required [type=missing, input_value={'i': 81}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_1: gave up after 3 attempts"]}
8
+ {"id": 335343, "n_gt": 15, "n_retrieved": 191, "n_selected": 13, "n_implied": 3, "n_structural": 4, "n_probe": 4, "ret_R": 0.6667, "P": 0.3077, "R": 0.2667, "F1": 0.2857, "leaf_P": 0.3, "leaf_R": 0.25, "leaf_F1": 0.2727, "n_leaf_sel": 10, "n_leaf_gt": 12, "ret_P": 0.0524, "sel_given_ret": 0.4, "over_sel": 0.87, "why": {"explicit": 4}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 1, "calls_exhausted_retries": 1, "attempts_total": 4, "attempt_errors": 3, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 36, "attempts_by_n_local": {"100": {"attempts": 3, "parse_ok": 0, "parse_fail": 0, "errors": 3}, "94": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.75, "call_exhaustion_rate": 0.5}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3077, "gen_R": 0.2667, "gen_F1": 0.2857, "missed": ["angry", "bed", "blonde_hair", "blue_eyes", "eyes_closed", "eyeshadow", "furniture", "green_eyes", "hair", "purple_hair", "sleeping"], "extra": ["anthro", "bedding", "blush", "clothed", "clothing", "humanoid", "lipstick", "red_lipstick", "x_eyes"], "ground_truth_tags": ["angry", "bed", "blonde_hair", "blue_eyes", "duo", "eyes_closed", "eyeshadow", "furniture", "green_eyes", "hair", "lying", "makeup", "purple_hair", "sleeping", "text"], "selected_tags": ["anthro", "bedding", "blush", "clothed", "clothing", "duo", "humanoid", "lipstick", "lying", "makeup", "red_lipstick", "text", "x_eyes"], "stage3_selected": ["bedding", "lying", "red_lipstick", "x_eyes"], "stage3_selected_scores": {"lying": 0.4413, "bedding": 0.3914, "red_lipstick": 0.4712, "x_eyes": 0.4222}, "stage3_selected_ranks": {"lying": 75, "bedding": 139, "red_lipstick": 53, "x_eyes": 91}, "stage3_selected_phrase_ranks": {"lying": 7, "bedding": 7, "red_lipstick": 5, 
"x_eyes": 5}, "extra_evidence": {"anthro": {"source": "probe"}, "bedding": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3914}, "blush": {"source": "probe"}, "clothed": {"source": "structural"}, "clothing": {"source": "implied"}, "humanoid": {"source": "structural"}, "lipstick": {"source": "implied"}, "red_lipstick": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4712}, "x_eyes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4222}}, "structural": ["duo", "humanoid", "clothed", "text"], "probe": ["simple_background", "anthro", "blush", "duo"], "t1": 2.63, "t2": 2.17, "t3": 66.86, "t3s": 1.42, "t3p": 0.61, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=194 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=4", "Stage3 general_chunk_0: attempt 1 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 2, \"why\": \"explicit\"}, {\"i\": 3, \"why\": \"explicit\"}, {\"i\": 4, \"why\": \"explicit\"}, {\"i\": 5, \"why\": \"other\"}, {\"i\": 6, \"why\": \"explicit\"}, {\"i\": 7, \"why\": \"other\"}, {\"i\": 10, \"why\": \"style_or_meta\"}, {\"i\": 12, \"why\": \"other\"}, {\"i\": 14, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"style_or_meta\"}, {\"i\": 17, \"why\": \"explicit\"}, {\"i\": 20, \"why\": \"other\"}, {\"i\": 22, \"why\": \"explicit\"}, {\"i\": 24, \"why\": \"other\"}, {\"i\": 26, \"why\": \"explicit\"}, {\"i\": 27, \"why\": \"other\"}, {\"i\": 28, \"why\": \"explicit\"}, {\"i\": 30, \"why\": \"explicit\"}, {\"i\": 31, \"why\": \"other\"}, {\"i\": 33, \"why\": \"other\"}, {\"i\": 34, \"why\": \"other\"}, {\"i\": 36, \"why\": \"other\"}, {\"i\": 38, \"why\": \"explicit\"}, {\"i\": 40, \"why\": \"other\"}, {\"i\": 42, \"why\": \"other\"}, {\"i\": 44, \"why\": \"other\"}, {\"i\": 46, \"why\": \"other\"}, {\"i\": 47, \"why\": \"style_or_meta\"}, {\"i\": 49, \"why\": 
\"explicit\"}, {\"i\": 50, \"why\": \"other\"}, {\"i\": 52, \"why\": \"other\"}, {\"i\": 54, \"why\": \"other\"}, {\"i\": 56, \"why\": \"other\"}, {\"i\": 58, \"why\": \"other\"}, {\"i\": 60, \"why\": \"other\"}, {\"i\": 62, \"why\": \"other\"}, {\"i\": 64, \"why\": \"other\"}, {\"i\": 66, \"why\": \"other\"}, {}]}. Got: 2 validation errors for Stage3SelectionResponse\nselections.38.i\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nselections.38.why\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_0: attempt 2 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 2, \"why\": \"explicit\"}, {\"i\": 3, \"why\": \"explicit\"}, {\"i\": 4, \"why\": \"explicit\"}, {\"i\": 5, \"why\": \"other\"}, {\"i\": 6, \"why\": \"explicit\"}, {\"i\": 7, \"why\": \"other\"}, {\"i\": 9, \"why\": \"explicit\"}, {\"i\": 10, \"why\": \"style_or_meta\"}, {\"i\": 12, \"why\": \"other\"}, {\"i\": 14, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"style_or_meta\"}, {\"i\": 17, \"why\": \"explicit\"}, {\"i\": 18, \"why\": \"other\"}, {\"i\": 19, \"why\": \"style_or_meta\"}, {\"i\": 20, \"why\": \"other\"}, {\"i\": 22, \"why\": \"explicit\"}, {\"i\": 24, \"why\": \"other\"}, {\"i\": 26, \"why\": \"explicit\"}, {\"i\": 27, \"why\": \"other\"}, {\"i\": 28, \"why\": \"explicit\"}, {\"i\": 30, \"why\": \"explicit\"}, {\"i\": 31, \"why\": \"explicit\"}, {\"i\": 33, \"why\": \"other\"}, {\"i\": 34, \"why\": \"other\"}, {\"i\": 36, \"why\": \"other\"}, {\"i\": 38, \"why\": \"explicit\"}, {\"i\": 40, \"why\": \"other\"}, {\"i\": 42, \"why\": \"other\"}, {\"i\": 44, \"why\": \"other\"}, {\"i\": 46, \"why\": \"other\"}, {\"i\": 47, \"why\": \"style_or_meta\"}, 
{\"i\": 49, \"why\": \"explicit\"}, {\"i\": 50, \"why\": \"other\"}, {\"i\": 52, \"why\": \"other\"}, {\"i\": 54, \"why\": \"other\"}, {\"i\": 56, \"why\": \"other\"}, {\"i\": 58, \"why\": \"other\"}, {\"i\": 60, \"why\": \"other\"}, {}]}. Got: 2 validation errors for Stage3SelectionResponse\nselections.38.i\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nselections.38.why\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_0: attempt 3 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 2, \"why\": \"explicit\"}, {\"i\": 3, \"why\": \"explicit\"}, {\"i\": 4, \"why\": \"explicit\"}, {\"i\": 5, \"why\": \"other\"}, {\"i\": 6, \"why\": \"explicit\"}, {\"i\": 7, \"why\": \"other\"}, {\"i\": 10, \"why\": \"style_or_meta\"}, {\"i\": 12, \"why\": \"other\"}, {\"i\": 14, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"style_or_meta\"}, {\"i\": 17, \"why\": \"explicit\"}, {\"i\": 20, \"why\": \"other\"}, {\"i\": 22, \"why\": \"explicit\"}, {\"i\": 24, \"why\": \"other\"}, {\"i\": 26, \"why\": \"explicit\"}, {\"i\": 27, \"why\": \"other\"}, {\"i\": 28, \"why\": \"explicit\"}, {\"i\": 30, \"why\": \"explicit\"}, {\"i\": 31, \"why\": \"explicit\"}, {\"i\": 33, \"why\": \"other\"}, {\"i\": 34, \"why\": \"other\"}, {\"i\": 36, \"why\": \"other\"}, {\"i\": 38, \"why\": \"explicit\"}, {\"i\": 40, \"why\": \"other\"}, {\"i\": 42, \"why\": \"other\"}, {\"i\": 44, \"why\": \"other\"}, {\"i\": 46, \"why\": \"other\"}, {\"i\": 47, \"why\": \"style_or_meta\"}, {\"i\": 49, \"why\": \"explicit\"}, {\"i\": 50, \"why\": \"other\"}, {\"i\": 52, \"why\": \"other\"}, {\"i\": 54, \"why\": \"other\"}, {\"i\": 56, \"why\": \"other\"}, {\"i\": 58, 
\"why\": \"other\"}, {\"i\": 60, \"why\": \"other\"}, {\"i\": 62, \"why\": \"other\"}, {\"i\": 64, \"why\": \"other\"}, {\"i\": 66, \"why\": \"style_or_meta\"}, {}]}. Got: 2 validation errors for Stage3SelectionResponse\nselections.38.i\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nselections.38.why\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_0: gave up after 3 attempts"]}
9
+ {"id": 17482, "n_gt": 22, "n_retrieved": 177, "n_selected": 41, "n_implied": 16, "n_structural": 3, "n_probe": 3, "ret_R": 0.5909, "P": 0.439, "R": 0.8182, "F1": 0.5714, "leaf_P": 0.3889, "leaf_R": 0.5385, "leaf_F1": 0.4516, "n_leaf_sel": 18, "n_leaf_gt": 13, "ret_P": 0.0734, "sel_given_ret": 1.3846, "over_sel": 1.86, "why": {"explicit": 22}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 3, "attempt_errors": 1, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 64, "attempts_by_n_local": {"100": {"attempts": 2, "parse_ok": 1, "parse_fail": 0, "errors": 1}, "77": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.3333333333333333, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.439, "gen_R": 0.8182, "gen_F1": 0.5714, "missed": ["fur", "holding_musical_instrument", "holding_object", "music"], "extra": ["acoustic_guitar", "action_pose", "bottomwear", "canis", "denim", "denim_clothing", "domestic_dog", "flowing_hair", "holding_guitar", "hybrid", "jeans", "pants", "pastel_background", "playing_guitar", "playing_music", "pose", "posed", "spitz", "torn_bottomwear", "torn_jeans", "torn_pants", "wolf", "wolfdog"], "ground_truth_tags": ["anthro", "bass_guitar", "canid", "canine", "claws", "clothed", "clothing", "fingers", "fur", "guitar", "hair", "holding_musical_instrument", "holding_object", "mammal", "music", "musical_instrument", "plucked_string_instrument", "solo", "spade_tail", "string_instrument", "tail", "torn_clothing"], "selected_tags": ["acoustic_guitar", "action_pose", "anthro", "bass_guitar", "bottomwear", "canid", "canine", "canis", "claws", "clothed", "clothing", "denim", "denim_clothing", "domestic_dog", "fingers", 
"flowing_hair", "guitar", "hair", "holding_guitar", "hybrid", "jeans", "mammal", "musical_instrument", "pants", "pastel_background", "playing_guitar", "playing_music", "plucked_string_instrument", "pose", "posed", "solo", "spade_tail", "spitz", "string_instrument", "tail", "torn_bottomwear", "torn_clothing", "torn_jeans", "torn_pants", "wolf", "wolfdog"], "stage3_selected": ["acoustic_guitar", "action_pose", "bass_guitar", "canid", "claws", "fingers", "flowing_hair", "guitar", "holding_guitar", "pastel_background", "playing_guitar", "playing_music", "posed", "spade_tail", "spitz", "string_instrument", "torn_bottomwear", "torn_clothing", "torn_jeans", "torn_pants", "wolf", "wolfdog"], "stage3_selected_scores": {"canid": 0.422, "claws": 0.5637, "fingers": 0.4356, "wolf": 0.458, "torn_clothing": 0.4105, "spitz": 0.4401, "spade_tail": 0.6167, "string_instrument": 0.8611, "torn_bottomwear": 0.4339, "guitar": 0.9617, "action_pose": 0.5789, "torn_pants": 0.4619, "wolfdog": 0.4194, "playing_music": 0.8725, "playing_guitar": 0.9311, "torn_jeans": 0.481, "bass_guitar": 0.9112, "flowing_hair": 0.5655, "holding_guitar": 0.8441, "acoustic_guitar": 0.8647, "posed": 0.4462, "pastel_background": 0.56}, "stage3_selected_ranks": {"canid": 105, "claws": 20, "fingers": 82, "wolf": 60, "torn_clothing": 114, "spitz": 78, "spade_tail": 13, "string_instrument": 8, "torn_bottomwear": 84, "guitar": 1, "action_pose": 15, "torn_pants": 54, "wolfdog": 109, "playing_music": 4, "playing_guitar": 2, "torn_jeans": 43, "bass_guitar": 3, "flowing_hair": 19, "holding_guitar": 11, "acoustic_guitar": 7, "posed": 72, "pastel_background": 22}, "stage3_selected_phrase_ranks": {"canid": 7, "claws": 1, "fingers": 1, "wolf": 1, "torn_clothing": 6, "spitz": 7, "spade_tail": 1, "string_instrument": 7, "torn_bottomwear": 3, "guitar": 1, "action_pose": 1, "torn_pants": 2, "wolfdog": 8, "playing_music": 3, "playing_guitar": 1, "torn_jeans": 1, "bass_guitar": 2, "flowing_hair": 1, "holding_guitar": 10, 
"acoustic_guitar": 5, "posed": 7, "pastel_background": 1}, "extra_evidence": {"acoustic_guitar": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8647}, "action_pose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5789}, "bottomwear": {"source": "implied"}, "canis": {"source": "implied"}, "denim": {"source": "implied"}, "denim_clothing": {"source": "implied"}, "domestic_dog": {"source": "implied"}, "flowing_hair": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5655}, "holding_guitar": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8441}, "hybrid": {"source": "implied"}, "jeans": {"source": "implied"}, "pants": {"source": "implied"}, "pastel_background": {"source": "stage3", "why": "explicit", "retrieval_score": 0.56}, "playing_guitar": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9311}, "playing_music": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8725}, "pose": {"source": "implied"}, "posed": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4462}, "spitz": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4401}, "torn_bottomwear": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4339}, "torn_jeans": {"source": "stage3", "why": "explicit", "retrieval_score": 0.481}, "torn_pants": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4619}, "wolf": {"source": "stage3", "why": "explicit", "retrieval_score": 0.458}, "wolfdog": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4194}}, "structural": ["solo", "anthro", "clothed"], "probe": ["anthro", "canid", "solo"], "t1": 2.63, "t2": 2.03, "t3": 31.06, "t3s": 0.91, "t3p": 1.4, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=177 entity=2 copyright_filtered=4 generic_char_to_general=0 unknown_type=2", "Stage3 general_chunk_0: attempt 1 error: Failed to parse Stage3SelectionResponse from 
completion {\"selections\": [{\"i\": 1, \"why\": \"weak_implied\"}, {\"i\": 5, \"why\": \"explicit\"}, {\"i\": 6, \"why\": \"weak_implied\"}, {\"i\": 7, \"why\": \"weak_implied\"}, {\"i\": 9, \"why\": \"explicit\"}, {\"i\": 11, \"why\": \"explicit\"}, {\"i\": 12, \"why\": \"explicit\"}, {\"i\": 13, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"explicit\"}, {\"i\": 17, \"why\": \"explicit\"}, {\"i\": 19, \"why\": \"explicit\"}, {\"i\": 21, \"why\": \"weak_implied\"}, {\"i\": 22, \"why\": \"weak_implied\"}, {\"i\": 23, \"why\": \"weak_implied\"}, {\"i\": 25, \"why\": \"weak_implied\"}, {\"i\": 27, \"why\": \"weak_implied\"}, {\"i\": 31, \"why\": \"other\"}, {\"i\": 33, \"why\": \"other\"}, {\"i\": 35, \"why\": \"weak_implied\"}, {\"i\": 36, \"why\": \"explicit\"}, {\"i\": 37, \"why\": \"weak_implied\"}, {\"i\": 38, \"why\": \"weak_implied\"}, {\"i\": 40, \"why\": \"weak_implied\"}, {\"i\": 41, \"why\": \"other\"}, {\"i\": 43, \"why\": \"weak_implied\"}, {\"i\": 44, \"why\": \"weak_implied\"}, {\"i\": 45, \"why\": \"weak_implied\"}, {\"i\": 47, \"why\": \"style_or_meta\"}, {\"i\": 51, \"why\": \"weak_implied\"}, {\"i\": 53, \"why\": \"weak_implied\"}, {\"i\": 55, \"why\": \"weak_implied\"}, {\"i\": 57, \"why\": \"explicit\"}, {\"i\": 58, \"why\": \"weak_implied\"}, {\"i\": 59, \"why\": \"weak_implied\"}, {\"i\": 61, \"why\": \"explicit\"}, {\"i\": 62, \"\": null}]}. Got: 1 validation error for Stage3SelectionResponse\nselections.35.why\n Field required [type=missing, input_value={'i': 62, '': None}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE "]}
10
+ {"id": 2021552, "n_gt": 25, "n_retrieved": 161, "n_selected": 26, "n_implied": 15, "n_structural": 4, "n_probe": 3, "ret_R": 0.64, "P": 0.4615, "R": 0.48, "F1": 0.4706, "leaf_P": 0.4, "leaf_R": 0.2667, "leaf_F1": 0.32, "n_leaf_sel": 10, "n_leaf_gt": 15, "ret_P": 0.0994, "sel_given_ret": 0.75, "over_sel": 1.04, "why": {"explicit": 8}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 3, "attempt_errors": 1, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 74, "attempts_by_n_local": {"100": {"attempts": 2, "parse_ok": 1, "parse_fail": 0, "errors": 1}, "61": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.3333333333333333, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.4615, "gen_R": 0.48, "gen_F1": 0.4706, "missed": ["claws", "crossed_arms", "facial_markings", "fur", "grey_background", "head_markings", "lagomorph", "leporid", "looking_at_another", "markings", "overalls", "rabbit", "standing"], "extra": ["black_bottomwear", "black_clothing", "black_pants", "blue_clothing", "blue_overalls", "blue_shirt", "blue_topwear", "holding_object", "holding_tool", "looking_at_viewer", "tools", "white_clothing", "white_shirt", "white_topwear"], "ground_truth_tags": ["anthro", "bottomwear", "canid", "canine", "claws", "clothed", "clothing", "crossed_arms", "duo", "facial_markings", "fox", "fur", "grey_background", "head_markings", "lagomorph", "leporid", "looking_at_another", "mammal", "markings", "overalls", "pants", "rabbit", "shirt", "standing", "topwear"], "selected_tags": ["anthro", "black_bottomwear", "black_clothing", "black_pants", "blue_clothing", "blue_overalls", "blue_shirt", "blue_topwear", "bottomwear", "canid", "canine", 
"clothed", "clothing", "duo", "fox", "holding_object", "holding_tool", "looking_at_viewer", "mammal", "pants", "shirt", "tools", "topwear", "white_clothing", "white_shirt", "white_topwear"], "stage3_selected": ["black_pants", "blue_overalls", "blue_shirt", "fox", "holding_tool", "shirt", "simple_background", "white_shirt"], "stage3_selected_scores": {"simple_background": 0.4161, "fox": 0.638, "shirt": 0.7484, "white_shirt": 0.8198, "black_pants": 0.8331, "blue_shirt": 0.7656, "holding_tool": 0.5163, "blue_overalls": 0.9203}, "stage3_selected_ranks": {"simple_background": 163, "fox": 47, "shirt": 18, "white_shirt": 4, "black_pants": 3, "blue_shirt": 7, "holding_tool": 124, "blue_overalls": 1}, "stage3_selected_phrase_ranks": {"simple_background": 8, "fox": 1, "shirt": 1, "white_shirt": 1, "black_pants": 1, "blue_shirt": 3, "holding_tool": 10, "blue_overalls": 1}, "extra_evidence": {"black_bottomwear": {"source": "implied"}, "black_clothing": {"source": "implied"}, "black_pants": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8331}, "blue_clothing": {"source": "implied"}, "blue_overalls": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9203}, "blue_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7656}, "blue_topwear": {"source": "implied"}, "holding_object": {"source": "implied"}, "holding_tool": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5163}, "looking_at_viewer": {"source": "structural"}, "tools": {"source": "implied"}, "white_clothing": {"source": "implied"}, "white_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8198}, "white_topwear": {"source": "implied"}}, "structural": ["duo", "anthro", "clothed", "looking_at_viewer"], "probe": ["simple_background", "anthro", "duo"], "t1": 2.36, "t2": 1.82, "t3": 49.39, "t3s": 1.59, "t3p": 5.34, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: 
general=161 entity=5 copyright_filtered=0 generic_char_to_general=0 unknown_type=3", "Stage3 general_chunk_0: attempt 1 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"explicit\"}, {\"i\": 3, \"why\": \"explicit\"}, {\"i\": 5, \"why\": \"explicit\"}, {\"i\": 7, \"why\": \"explicit\"}, {\"i\": 9, \"why\": \"explicit\"}, {\"i\": 10, \"why\": \"explicit\"}, {\"i\": 12, \"why\": \"weak_implied\"}, {\"i\": 14, \"why\": \"explicit\"}, {\"i\": 17, \"why\": \"weak_implied\"}, {\"i\": 19, \"why\": \"explicit\"}, {\"i\": 20, \"why\": \"explicit\"}, {\"i\": 22, \"why\": \"other\"}, {\"i\": 24, \"why\": \"weak_implied\"}, {\"i\": 26, \"why\": \"other\"}, {\"i\": 28, \"why\": \"weak_implied\"}, {\"i\": 30, \"why\": \"other\"}, {\"i\": 32, \"why\": \"weak_implied\"}, {\"i\": 34, \"why\": \"explicit\"}, {\"i\": 36, \"why\": \"weak_implied\"}, {\"i\": 38, \"why\": \"weak_implied\"}, {\"i\": 40, \"why\": \"weak_implied\"}, {\"i\": 42, \"why\": \"other\"}, {\"i\": 44, \"why\": \"weak_implied\"}, {\"i\": 46, \"why\": \"other\"}, {\"i\": 48, \"why\": \"weak_implied\"}, {\"i\": 50, \"why\": \"other\"}, {\"i\": 52, \"why\": \"weak_implied\"}, {\"i\": 54, \"why\": \"weak_implied\"}, {\"i\": 56, \"why\": \"weak_implied\"}, {\"i\": 58, \"why\": \"style_or_meta\"}, {\"i\": 60, \"why\": \"weak_implied\"}, {\"i\": 62, \"why\": \"weak_implied\"}, {\"i\": 64, \"why\": \"weak_implied\"}, {\"i\": 66, \"why\": \"weak_implied\"}, {\"i\": 68, \"why\": \"weak_implied\"}, {\"i\": 70}]}. Got: 1 validation error for Stage3SelectionResponse\nselections.35.why\n Field required [type=missing, input_value={'i': 70}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE "]}
11
+ {"id": 2034167, "n_gt": 11, "n_retrieved": 183, "n_selected": 64, "n_implied": 27, "n_structural": 4, "n_probe": 3, "ret_R": 0.6364, "P": 0.0781, "R": 0.4545, "F1": 0.1333, "leaf_P": 0.0294, "leaf_R": 0.1429, "leaf_F1": 0.0488, "n_leaf_sel": 34, "n_leaf_gt": 7, "ret_P": 0.0383, "sel_given_ret": 0.7143, "over_sel": 5.82, "why": {"explicit": 32}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 1, "calls_exhausted_retries": 1, "attempts_total": 4, "attempt_errors": 3, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 32, "attempts_by_n_local": {"100": {"attempts": 3, "parse_ok": 0, "parse_fail": 0, "errors": 3}, "86": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.75, "call_exhaustion_rate": 0.5}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.0781, "gen_R": 0.4545, "gen_F1": 0.1333, "missed": ["blue_eyes", "blue_nose", "open_mouth", "purple_body", "white_body", "white_fur"], "extra": ["amphibian", "amphibian_humanoid", "animal_humanoid", "anthro", "blinking", "blue_ears", "blue_pawpads", "blue_paws", "blue_stripes", "body_hair", "border", "canid_humanoid", "canine_humanoid", "clothed", "clothing", "corpse", "countershading", "crosslegged_pose", "curled_up", "facial_markings", "fennec_humanoid", "fetal_pose", "fighting_pose", "fox_humanoid", "frog_humanoid", "glistening", "glistening_nose", "grey_nose", "head_markings", "heterochromia", "humanoid", "male", "male_humanoid", "mammal_humanoid", "markings", "mouth_full", "muscular", "muscular_male", "pawpads", "pink_body", "pink_countershading", "pose", "purple_border", "purple_tongue", "striped_face", "stripes", "swinging", "tail", "tail_tuft", "tan_body", "tan_fur", "tan_tail", "tanuki_humanoid", "tongue", "tuft", "two_tone_tail", "wavy_tail", 
"white_stripes", "white_tail"], "ground_truth_tags": ["blue_eyes", "blue_nose", "canid", "canine", "fur", "mammal", "open_mouth", "purple_body", "solo", "white_body", "white_fur"], "selected_tags": ["amphibian", "amphibian_humanoid", "animal_humanoid", "anthro", "blinking", "blue_ears", "blue_pawpads", "blue_paws", "blue_stripes", "body_hair", "border", "canid", "canid_humanoid", "canine", "canine_humanoid", "clothed", "clothing", "corpse", "countershading", "crosslegged_pose", "curled_up", "facial_markings", "fennec_humanoid", "fetal_pose", "fighting_pose", "fox_humanoid", "frog_humanoid", "fur", "glistening", "glistening_nose", "grey_nose", "head_markings", "heterochromia", "humanoid", "male", "male_humanoid", "mammal", "mammal_humanoid", "markings", "mouth_full", "muscular", "muscular_male", "pawpads", "pink_body", "pink_countershading", "pose", "purple_border", "purple_tongue", "solo", "striped_face", "stripes", "swinging", "tail", "tail_tuft", "tan_body", "tan_fur", "tan_tail", "tanuki_humanoid", "tongue", "tuft", "two_tone_tail", "wavy_tail", "white_stripes", "white_tail"], "stage3_selected": ["blinking", "blue_ears", "blue_pawpads", "blue_paws", "blue_stripes", "body_hair", "corpse", "crosslegged_pose", "fennec_humanoid", "fetal_pose", "fighting_pose", "frog_humanoid", "glistening_nose", "grey_nose", "heterochromia", "male_humanoid", "mouth_full", "muscular_male", "pink_countershading", "purple_border", "purple_tongue", "striped_face", "swinging", "tail_tuft", "tan_fur", "tan_tail", "tanuki_humanoid", "tongue", "two_tone_tail", "wavy_tail", "white_stripes", "white_tail"], "stage3_selected_scores": {"tongue": 0.3351, "muscular_male": 0.2998, "tan_fur": 0.4046, "body_hair": 0.2971, "tail_tuft": 0.4789, "white_tail": 0.4815, "heterochromia": 0.4217, "two_tone_tail": 0.4801, "blue_pawpads": 0.4689, "tan_tail": 0.4917, "blue_ears": 0.4793, "purple_tongue": 0.4737, "grey_nose": 0.4276, "glistening_nose": 0.4234, "blue_stripes": 0.5367, "corpse": 0.3037, 
"fighting_pose": 0.4379, "white_stripes": 0.532, "purple_border": 0.513, "blue_paws": 0.4653, "striped_face": 0.5338, "tanuki_humanoid": 0.77, "wavy_tail": 0.4877, "frog_humanoid": 0.5239, "pink_countershading": 0.493, "male_humanoid": 0.5449, "crosslegged_pose": 0.4261, "swinging": 0.3357, "blinking": 0.3357, "fetal_pose": 0.4288, "mouth_full": 0.4434, "fennec_humanoid": 0.7856}, "stage3_selected_ranks": {"tongue": 183, "muscular_male": 189, "tan_fur": 148, "body_hair": 190, "tail_tuft": 91, "white_tail": 85, "heterochromia": 142, "two_tone_tail": 88, "blue_pawpads": 98, "tan_tail": 71, "blue_ears": 89, "purple_tongue": 94, "grey_nose": 138, "glistening_nose": 141, "blue_stripes": 47, "corpse": 188, "fighting_pose": 128, "white_stripes": 51, "purple_border": 63, "blue_paws": 101, "striped_face": 50, "tanuki_humanoid": 7, "wavy_tail": 75, "frog_humanoid": 54, "pink_countershading": 70, "male_humanoid": 43, "crosslegged_pose": 139, "swinging": 181, "blinking": 182, "fetal_pose": 137, "mouth_full": 126, "fennec_humanoid": 6}, "stage3_selected_phrase_ranks": {"tongue": 10, "muscular_male": 9, "tan_fur": 8, "body_hair": 10, "tail_tuft": 3, "white_tail": 8, "heterochromia": 6, "two_tone_tail": 9, "blue_pawpads": 6, "tan_tail": 6, "blue_ears": 7, "purple_tongue": 9, "grey_nose": 7, "glistening_nose": 9, "blue_stripes": 2, "corpse": 8, "fighting_pose": 5, "white_stripes": 4, "purple_border": 6, "blue_paws": 8, "striped_face": 6, "tanuki_humanoid": 7, "wavy_tail": 7, "frog_humanoid": 10, "pink_countershading": 9, "male_humanoid": 9, "crosslegged_pose": 10, "swinging": 8, "blinking": 9, "fetal_pose": 9, "mouth_full": 9, "fennec_humanoid": 6}, "extra_evidence": {"amphibian": {"source": "implied"}, "amphibian_humanoid": {"source": "implied"}, "animal_humanoid": {"source": "implied"}, "anthro": {"source": "structural"}, "blinking": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3357}, "blue_ears": {"source": "stage3", "why": "explicit", "retrieval_score": 
0.4793}, "blue_pawpads": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4689}, "blue_paws": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4653}, "blue_stripes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5367}, "body_hair": {"source": "stage3", "why": "explicit", "retrieval_score": 0.2971}, "border": {"source": "implied"}, "canid_humanoid": {"source": "implied"}, "canine_humanoid": {"source": "implied"}, "clothed": {"source": "structural"}, "clothing": {"source": "implied"}, "corpse": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3037}, "countershading": {"source": "implied"}, "crosslegged_pose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4261}, "curled_up": {"source": "implied"}, "facial_markings": {"source": "implied"}, "fennec_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7856}, "fetal_pose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4288}, "fighting_pose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4379}, "fox_humanoid": {"source": "implied"}, "frog_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5239}, "glistening": {"source": "implied"}, "glistening_nose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4234}, "grey_nose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4276}, "head_markings": {"source": "implied"}, "heterochromia": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4217}, "humanoid": {"source": "implied"}, "male": {"source": "structural"}, "male_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5449}, "mammal_humanoid": {"source": "implied"}, "markings": {"source": "implied"}, "mouth_full": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4434}, "muscular": {"source": "implied"}, "muscular_male": {"source": "stage3", "why": "explicit", "retrieval_score": 0.2998}, "pawpads": {"source": "implied"}, "pink_body": 
{"source": "implied"}, "pink_countershading": {"source": "stage3", "why": "explicit", "retrieval_score": 0.493}, "pose": {"source": "implied"}, "purple_border": {"source": "stage3", "why": "explicit", "retrieval_score": 0.513}, "purple_tongue": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4737}, "striped_face": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5338}, "stripes": {"source": "implied"}, "swinging": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3357}, "tail": {"source": "implied"}, "tail_tuft": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4789}, "tan_body": {"source": "implied"}, "tan_fur": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4046}, "tan_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4917}, "tanuki_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.77}, "tongue": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3351}, "tuft": {"source": "implied"}, "two_tone_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4801}, "wavy_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4877}, "white_stripes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.532}, "white_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4815}}, "structural": ["solo", "anthro", "male", "clothed"], "probe": ["anthro", "canid", "solo"], "t1": 1.95, "t2": 1.8, "t3": 64.08, "t3s": 1.95, "t3p": 4.82, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=186 entity=2 copyright_filtered=2 generic_char_to_general=4 unknown_type=12", "Stage3 general_chunk_0: attempt 1 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"explicit\"}, {\"i\": 2, \"why\": \"explicit\"}, {\"i\": 3, \"why\": \"weak_implied\"}, {\"i\": 4, \"why\": \"explicit\"}, {\"i\": 5, 
\"why\": \"explicit\"}, {\"i\": 6, \"why\": \"weak_implied\"}, {\"i\": 7, \"why\": \"explicit\"}, {\"i\": 11, \"why\": \"explicit\"}, {\"i\": 12, \"why\": \"weak_implied\"}, {\"i\": 13, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"explicit\"}, {\"i\": 17, \"why\": \"weak_implied\"}, {\"i\": 19, \"why\": \"explicit\"}, {\"i\": 21, \"why\": \"explicit\"}, {\"i\": 22, \"why\": \"explicit\"}, {\"i\": 23, \"why\": \"weak_implied\"}, {\"i\": 25, \"why\": \"other\"}, {\"i\": 26, \"why\": \"weak_implied\"}, {\"i\": 27, \"why\": \"weak_implied\"}, {\"i\": 29, \"why\": \"weak_implied\"}, {\"i\": 31, \"why\": \"explicit\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 35, \"why\": \"weak_implied\"}, {\"i\": 36, \"why\": \"weak_implied\"}, {\"i\": 38, \"why\": \"weak_implied\"}, {\"i\": 40, \"why\": \"weak_implied\"}, {\"i\": 42, \"why\": \"style_or_meta\"}, {\"i\": 44, \"why\": \"weak_implied\"}, {\"i\": 46, \"why\": \"weak_implied\"}, {\"i\": 47, \"why\": \"explicit\"}, {\"i\": 49, \"why\": \"weak_implied\"}, {\"i\": 51, \"why\": \"weak_implied\"}, {\"i\": 53, \"why\": \"weak_implied\"}, {\"i\": 55, \"why\": \"weak_implied\"}, {\"i\": 57, \"why\": \"weak_implied\"}, {\"i\": 59, \"\": null}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.35.why\n Field required [type=missing, input_value={'i': 59, '': None}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_0: attempt 2 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"explicit\"}, {\"i\": 2, \"why\": \"explicit\"}, {\"i\": 3, \"why\": \"weak_implied\"}, {\"i\": 4, \"why\": \"explicit\"}, {\"i\": 5, \"why\": \"explicit\"}, {\"i\": 6, \"why\": \"weak_implied\"}, {\"i\": 7, \"why\": \"explicit\"}, {\"i\": 9, \"why\": \"explicit\"}, {\"i\": 10, \"why\": \"explicit\"}, {\"i\": 11, \"why\": \"explicit\"}, {\"i\": 12, \"why\": \"weak_implied\"}, {\"i\": 13, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"explicit\"}, {\"i\": 17, \"why\": \"weak_implied\"}, {\"i\": 19, \"why\": \"explicit\"}, {\"i\": 21, \"why\": \"weak_implied\"}, {\"i\": 22, \"why\": \"explicit\"}, {\"i\": 23, \"why\": \"weak_implied\"}, {\"i\": 25, \"why\": \"other\"}, {\"i\": 26, \"why\": \"weak_implied\"}, {\"i\": 27, \"why\": \"weak_implied\"}, {\"i\": 29, \"why\": \"weak_implied\"}, {\"i\": 31, \"why\": \"weak_implied\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 35, \"why\": \"weak_implied\"}, {\"i\": 37, \"why\": \"weak_implied\"}, {\"i\": 39, \"why\": \"weak_implied\"}, {\"i\": 41, \"why\": \"weak_implied\"}, {\"i\": 43, \"why\": \"other\"}, {\"i\": 45, \"why\": \"weak_implied\"}, {\"i\": 47, \"why\": \"explicit\"}, {\"i\": 49, \"why\": \"weak_implied\"}, {\"i\": 51, \"why\": \"explicit\"}, {\"i\": 53, \"why\": \"weak_implied\"}, {\"i\": 55, \"why\": \"weak_implied\"}, {\"i\": 57}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.35.why\n Field required [type=missing, input_value={'i': 57}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_0: attempt 3 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"explicit\"}, {\"i\": 2, \"why\": \"explicit\"}, {\"i\": 3, \"why\": \"weak_implied\"}, {\"i\": 4, \"why\": \"explicit\"}, {\"i\": 5, \"why\": \"explicit\"}, {\"i\": 6, \"why\": \"weak_implied\"}, {\"i\": 7, \"why\": \"explicit\"}, {\"i\": 11, \"why\": \"explicit\"}, {\"i\": 12, \"why\": \"weak_implied\"}, {\"i\": 13, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"explicit\"}, {\"i\": 17, \"why\": \"weak_implied\"}, {\"i\": 19, \"why\": \"explicit\"}, {\"i\": 21, \"why\": \"explicit\"}, {\"i\": 22, \"why\": \"explicit\"}, {\"i\": 25, \"why\": \"other\"}, {\"i\": 26, \"why\": \"weak_implied\"}, {\"i\": 29, \"why\": \"weak_implied\"}, {\"i\": 31, \"why\": \"other\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 35, \"why\": \"weak_implied\"}, {\"i\": 37, \"why\": \"weak_implied\"}, {\"i\": 39, \"why\": \"weak_implied\"}, {\"i\": 41, \"why\": \"weak_implied\"}, {\"i\": 43, \"why\": \"other\"}, {\"i\": 45, \"why\": \"weak_implied\"}, {\"i\": 47, \"why\": \"other\"}, {\"i\": 49, \"why\": \"weak_implied\"}, {\"i\": 51, \"why\": \"weak_implied\"}, {\"i\": 53, \"why\": \"weak_implied\"}, {\"i\": 55, \"why\": \"weak_implied\"}, {\"i\": 57, \"why\": \"weak_implied\"}, {\"i\": 59, \"why\": \"weak_implied\"}, {\"i\": 61, \"why\": \"weak_implied\"}, {\"i\": 63, \"why\": \"weak_implied\"}, {\"i\": 65}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.35.why\n Field required [type=missing, input_value={'i': 65}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_0: gave up after 3 attempts"]}
data/eval_results/latency_chunk60_k6_seed42.jsonl ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"_meta": true, "timestamp": "2026-03-02T05:59:20.226851", "n_samples": 10, "caption_field": "caption_cogvlm", "skip_rewrite": false, "allow_nsfw": false, "mode": "chunked_map_union", "chunk_size": 60, "eval_path": "data/eval_samples/e621_sfw_sample_1000_seed123_buffer10000_caption_evident.jsonl", "per_phrase_k": 2, "per_phrase_final_k": 6, "temperature": 0.0, "shuffle": false, "seed": 42, "workers": 4, "min_why": "strong_implied", "expand_implications": true, "infer_structural": true, "infer_probe": true, "n_errors": 0, "n_issue_samples": 10, "n_issues_total": 27}
2
+ {"id": 3285630, "n_gt": 12, "n_retrieved": 6, "n_selected": 9, "n_implied": 1, "n_structural": 4, "n_probe": 5, "ret_R": 0.0, "P": 0.7778, "R": 0.5833, "F1": 0.6667, "leaf_P": 0.5714, "leaf_R": 0.4444, "leaf_F1": 0.5, "n_leaf_sel": 7, "n_leaf_gt": 9, "ret_P": 0.0, "sel_given_ret": 0.0, "over_sel": 0.75, "why": {"strong_implied": 1}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 2, "attempts_by_n_local": {"6": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.7778, "gen_R": 0.5833, "gen_F1": 0.6667, "missed": ["alpha_channel", "feline", "fingers", "fur", "hair"], "extra": ["humor", "text"], "ground_truth_tags": ["alpha_channel", "anthro", "clothed", "clothing", "felid", "feline", "fingers", "fur", "hair", "male", "mammal", "solo"], "selected_tags": ["anthro", "clothed", "clothing", "felid", "humor", "male", "mammal", "solo", "text"], "stage3_selected": ["humor"], "stage3_selected_scores": {"humor": 0.4396}, "stage3_selected_ranks": {"humor": 3}, "stage3_selected_phrase_ranks": {"humor": 3}, "extra_evidence": {"humor": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4396}, "text": {"source": "probe"}}, "structural": ["solo", "anthro", "male", "clothed"], "probe": ["clothing", "anthro", "text", "felid", "solo"], "t1": 5.4, "t2": 2.04, "t3": 0.67, "t3s": 3.86, "t3p": 4.31, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=6 entity=0 copyright_filtered=0 generic_char_to_general=0 
unknown_type=0"]}
3
+ {"id": 260449, "n_gt": 14, "n_retrieved": 6, "n_selected": 14, "n_implied": 2, "n_structural": 7, "n_probe": 5, "ret_R": 0.0, "P": 0.4286, "R": 0.4286, "F1": 0.4286, "leaf_P": 0.2, "leaf_R": 0.2, "leaf_F1": 0.2, "n_leaf_sel": 10, "n_leaf_gt": 10, "ret_P": 0.0, "sel_given_ret": 0.0, "over_sel": 1.0, "why": {"strong_implied": 2}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 2, "attempts_by_n_local": {"6": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.4286, "gen_R": 0.4286, "gen_F1": 0.4286, "missed": ["ape", "dancing", "fur", "hair", "haplorhine", "human", "male", "primate"], "extra": ["anthro", "crossover", "duo", "feral", "humor", "text", "topless", "trio"], "ground_truth_tags": ["ape", "bear", "clothed", "clothing", "dancing", "fur", "group", "hair", "haplorhine", "human", "looking_at_viewer", "male", "mammal", "primate"], "selected_tags": ["anthro", "bear", "clothed", "clothing", "crossover", "duo", "feral", "group", "humor", "looking_at_viewer", "mammal", "text", "topless", "trio"], "stage3_selected": ["crossover", "humor"], "stage3_selected_scores": {"humor": 0.3489, "crossover": 0.3287}, "stage3_selected_ranks": {"humor": 4, "crossover": 6}, "stage3_selected_phrase_ranks": {"humor": 4, "crossover": 6}, "extra_evidence": {"anthro": {"source": "structural"}, "crossover": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.3287}, "duo": {"source": "probe"}, "feral": {"source": "structural"}, "humor": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.3489}, 
"text": {"source": "structural"}, "topless": {"source": "structural"}, "trio": {"source": "structural"}}, "structural": ["trio", "anthro", "feral", "clothed", "topless", "looking_at_viewer", "text"], "probe": ["anthro", "duo", "group", "bear", "simple_background"], "t1": 6.1, "t2": 1.48, "t3": 0.78, "t3s": 6.44, "t3p": 4.25, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=6 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0"]}
4
+ {"id": 1078019, "n_gt": 14, "n_retrieved": 83, "n_selected": 23, "n_implied": 3, "n_structural": 4, "n_probe": 5, "ret_R": 0.6429, "P": 0.4348, "R": 0.7143, "F1": 0.5405, "leaf_P": 0.4118, "leaf_R": 0.7778, "leaf_F1": 0.5385, "n_leaf_sel": 17, "n_leaf_gt": 9, "ret_P": 0.1084, "sel_given_ret": 1.1111, "over_sel": 1.64, "why": {"explicit": 13, "strong_implied": 1}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 3, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 3, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 42, "attempts_by_n_local": {"60": {"attempts": 2, "parse_ok": 2, "parse_fail": 0, "errors": 0}, "24": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.4348, "gen_R": 0.7143, "gen_F1": 0.5405, "missed": ["lagomorph", "leporid", "mammal", "rabbit"], "extra": ["<3", "coat", "expressions", "holding_object", "holding_plushie", "intimate", "lab_coat", "looking_at_viewer", "love", "relationship", "shocked_expression", "topwear", "vest"], "ground_truth_tags": ["anthro", "blue_eyes", "blush", "clothed", "clothing", "duo", "lagomorph", "leporid", "mammal", "plushie", "rabbit", "romantic", "romantic_couple", "teal_eyes"], "selected_tags": ["<3", "anthro", "blue_eyes", "blush", "clothed", "clothing", "coat", "duo", "expressions", "holding_object", "holding_plushie", "intimate", "lab_coat", "looking_at_viewer", "love", "plushie", "relationship", "romantic", "romantic_couple", "shocked_expression", "teal_eyes", "topwear", "vest"], "stage3_selected": ["blue_eyes", "coat", "duo", "expressions", "holding_plushie", "intimate", "lab_coat", "love", "plushie", "relationship", "romantic_couple", "shocked_expression", 
"teal_eyes", "vest"], "stage3_selected_scores": {"duo": 0.3628, "blue_eyes": 0.6151, "romantic_couple": 0.5621, "coat": 0.6383, "plushie": 0.7455, "vest": 0.5028, "love": 0.4693, "teal_eyes": 0.6283, "lab_coat": 0.516, "intimate": 0.4403, "expressions": 0.5454, "holding_plushie": 0.7793, "relationship": 0.6206, "shocked_expression": 0.5745}, "stage3_selected_ranks": {"duo": 81, "blue_eyes": 12, "romantic_couple": 18, "coat": 7, "plushie": 3, "vest": 42, "love": 55, "teal_eyes": 8, "lab_coat": 38, "intimate": 70, "expressions": 23, "holding_plushie": 2, "relationship": 9, "shocked_expression": 16}, "stage3_selected_phrase_ranks": {"duo": 3, "blue_eyes": 1, "romantic_couple": 1, "coat": 1, "plushie": 1, "vest": 6, "love": 5, "teal_eyes": 1, "lab_coat": 5, "intimate": 6, "expressions": 2, "holding_plushie": 1, "relationship": 1, "shocked_expression": 4}, "extra_evidence": {"<3": {"source": "probe"}, "coat": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6383}, "expressions": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5454}, "holding_object": {"source": "implied"}, "holding_plushie": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7793}, "intimate": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4403}, "lab_coat": {"source": "stage3", "why": "explicit", "retrieval_score": 0.516}, "looking_at_viewer": {"source": "structural"}, "love": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4693}, "relationship": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6206}, "shocked_expression": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.5745}, "topwear": {"source": "implied"}, "vest": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5028}}, "structural": ["duo", "anthro", "clothed", "looking_at_viewer"], "probe": ["clothing", "anthro", "blush", "duo", "<3"], "t1": 2.83, "t2": 1.55, "t3": 14.48, "t3s": 3.46, "t3p": 8.24, "err": null, "issues": ["Stage3 split: 
general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=84 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
5
+ {"id": 1624724, "n_gt": 4, "n_retrieved": 80, "n_selected": 10, "n_implied": 0, "n_structural": 4, "n_probe": 3, "ret_R": 0.75, "P": 0.3, "R": 0.75, "F1": 0.4286, "leaf_P": 0.3, "leaf_R": 0.75, "leaf_F1": 0.4286, "n_leaf_sel": 10, "n_leaf_gt": 4, "ret_P": 0.0375, "sel_given_ret": 1.0, "over_sel": 2.5, "why": {"explicit": 6}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 1, "calls_exhausted_retries": 1, "attempts_total": 4, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 4, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 13, "attempts_by_n_local": {"60": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}, "24": {"attempts": 3, "parse_ok": 3, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.5}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3, "gen_R": 0.75, "gen_F1": 0.4286, "missed": ["tan_body"], "extra": ["ambiguous_gender", "anthro", "eyes", "feral", "floating", "nude", "round_eyes"], "ground_truth_tags": ["red_nose", "smile", "solo", "tan_body"], "selected_tags": ["ambiguous_gender", "anthro", "eyes", "feral", "floating", "nude", "red_nose", "round_eyes", "smile", "solo"], "stage3_selected": ["eyes", "floating", "red_nose", "round_eyes", "simple_background", "smile"], "stage3_selected_scores": {"simple_background": 0.5334, "smile": 0.6084, "red_nose": 0.7451, "floating": 0.6767, "round_eyes": 0.8853, "eyes": 0.929}, "stage3_selected_ranks": {"simple_background": 54, "smile": 34, "red_nose": 4, "floating": 18, "round_eyes": 2, "eyes": 1}, "stage3_selected_phrase_ranks": {"simple_background": 6, "smile": 2, "red_nose": 1, "floating": 1, "round_eyes": 1, "eyes": 1}, "extra_evidence": {"ambiguous_gender": {"source": "structural"}, "anthro": {"source": "probe"}, "eyes": {"source": "stage3", "why": "explicit", 
"retrieval_score": 0.929}, "feral": {"source": "structural"}, "floating": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6767}, "nude": {"source": "structural"}, "round_eyes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8853}}, "structural": ["solo", "feral", "ambiguous_gender", "nude"], "probe": ["simple_background", "anthro", "solo"], "t1": 2.71, "t2": 1.42, "t3": 8.73, "t3s": 3.78, "t3p": 8.48, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=84 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=4", "Stage3 general_chunk_1: gave up after 3 attempts"]}
6
+ {"id": 1325009, "n_gt": 22, "n_retrieved": 108, "n_selected": 34, "n_implied": 12, "n_structural": 4, "n_probe": 3, "ret_R": 0.3636, "P": 0.5, "R": 0.7727, "F1": 0.6071, "leaf_P": 0.1579, "leaf_R": 0.25, "leaf_F1": 0.1935, "n_leaf_sel": 19, "n_leaf_gt": 12, "ret_P": 0.0741, "sel_given_ret": 2.125, "over_sel": 1.55, "why": {"explicit": 10, "strong_implied": 7}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 36, "attempts_by_n_local": {"60": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}, "47": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.5, "gen_R": 0.7727, "gen_F1": 0.6071, "missed": ["chest_tuft", "muscular", "muscular_anthro", "muscular_male", "topless"], "extra": ["belly", "bengal_tiger", "countershade_belly", "countershade_body", "gesture", "glistening", "glistening_body", "glistening_eyes", "glistening_fur", "hand_on_own_head", "hotpants", "minishorts", "muscular_legs", "quads", "striped_body", "striped_fur", "white_chest"], "ground_truth_tags": ["anthro", "blue_eyes", "bottomwear", "chest_tuft", "clothed", "clothing", "countershading", "felid", "fur", "hand_on_head", "male", "mammal", "muscular", "muscular_anthro", "muscular_male", "pantherine", "shorts", "solo", "stripes", "tiger", "topless", "tuft"], "selected_tags": ["anthro", "belly", "bengal_tiger", "blue_eyes", "bottomwear", "clothed", "clothing", "countershade_belly", "countershade_body", "countershading", "felid", "fur", "gesture", "glistening", "glistening_body", "glistening_eyes", "glistening_fur", "hand_on_head", 
"hand_on_own_head", "hotpants", "male", "mammal", "minishorts", "muscular_legs", "pantherine", "quads", "shorts", "solo", "striped_body", "striped_fur", "stripes", "tiger", "tuft", "white_chest"], "stage3_selected": ["bengal_tiger", "blue_eyes", "countershade_belly", "countershade_body", "gesture", "glistening_eyes", "glistening_fur", "hand_on_head", "hand_on_own_head", "minishorts", "muscular_legs", "quads", "shorts", "striped_fur", "stripes", "tuft", "white_chest"], "stage3_selected_scores": {"blue_eyes": 0.5727, "tuft": 0.4907, "stripes": 0.469, "shorts": 0.58, "gesture": 0.5868, "hand_on_head": 0.5941, "quads": 0.6704, "bengal_tiger": 0.4385, "minishorts": 0.5208, "hand_on_own_head": 0.526, "striped_fur": 0.6394, "glistening_eyes": 0.4754, "glistening_fur": 0.4988, "muscular_legs": 0.7895, "white_chest": 0.92, "countershade_body": 0.8756, "countershade_belly": 0.8307}, "stage3_selected_ranks": {"blue_eyes": 47, "tuft": 68, "stripes": 85, "shorts": 46, "gesture": 44, "hand_on_head": 40, "quads": 23, "bengal_tiger": 94, "minishorts": 55, "hand_on_own_head": 51, "striped_fur": 29, "glistening_eyes": 82, "glistening_fur": 65, "muscular_legs": 11, "white_chest": 2, "countershade_body": 3, "countershade_belly": 9}, "stage3_selected_phrase_ranks": {"blue_eyes": 2, "tuft": 4, "stripes": 3, "shorts": 1, "gesture": 1, "hand_on_head": 2, "quads": 4, "bengal_tiger": 5, "minishorts": 6, "hand_on_own_head": 4, "striped_fur": 2, "glistening_eyes": 3, "glistening_fur": 3, "muscular_legs": 2, "white_chest": 1, "countershade_body": 1, "countershade_belly": 2}, "extra_evidence": {"belly": {"source": "implied"}, "bengal_tiger": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4385}, "countershade_belly": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.8307}, "countershade_body": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.8756}, "gesture": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5868}, "glistening": 
{"source": "implied"}, "glistening_body": {"source": "implied"}, "glistening_eyes": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4754}, "glistening_fur": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4988}, "hand_on_own_head": {"source": "stage3", "why": "explicit", "retrieval_score": 0.526}, "hotpants": {"source": "implied"}, "minishorts": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5208}, "muscular_legs": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.7895}, "quads": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6704}, "striped_body": {"source": "implied"}, "striped_fur": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.6394}, "white_chest": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.92}}, "structural": ["solo", "anthro", "male", "clothed"], "probe": ["anthro", "felid", "solo"], "t1": 1.55, "t2": 1.88, "t3": 13.52, "t3s": 2.53, "t3p": 1.33, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=107 entity=1 copyright_filtered=1 generic_char_to_general=0 unknown_type=2"]}
7
+ {"id": 1023509, "n_gt": 13, "n_retrieved": 124, "n_selected": 15, "n_implied": 2, "n_structural": 0, "n_probe": 4, "ret_R": 0.6154, "P": 0.4, "R": 0.4615, "F1": 0.4286, "leaf_P": 0.2727, "leaf_R": 0.5, "leaf_F1": 0.3529, "n_leaf_sel": 11, "n_leaf_gt": 6, "ret_P": 0.0645, "sel_given_ret": 0.75, "over_sel": 1.15, "why": {"explicit": 4, "strong_implied": 6}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 1, "calls_exhausted_retries": 1, "attempts_total": 5, "attempt_errors": 4, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 24, "attempts_by_n_local": {"60": {"attempts": 2, "parse_ok": 1, "parse_fail": 0, "errors": 1}, "58": {"attempts": 3, "parse_ok": 0, "parse_fail": 0, "errors": 3}}, "attempt_failure_rate": 0.8, "call_exhaustion_rate": 0.5}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.4, "gen_R": 0.4615, "gen_F1": 0.4286, "missed": ["dialogue", "fur", "lizard", "reptile", "scalie", "white_body", "white_fur"], "extra": ["anthro", "clothing", "dark_theme", "darkness", "group", "knife", "light", "medieval_fantasy", "trio"], "ground_truth_tags": ["bovid", "caprine", "dialogue", "fur", "goat", "human", "lizard", "mammal", "reptile", "scalie", "text", "white_body", "white_fur"], "selected_tags": ["anthro", "bovid", "caprine", "clothing", "dark_theme", "darkness", "goat", "group", "human", "knife", "light", "mammal", "medieval_fantasy", "text", "trio"], "stage3_selected": ["bovid", "dark_theme", "darkness", "goat", "group", "human", "knife", "light", "medieval_fantasy", "trio"], "stage3_selected_scores": {"light": 0.7782, "darkness": 0.8346, "dark_theme": 0.5937, "medieval_fantasy": 0.4783, "group": 0.6233, "human": 0.6639, "bovid": 0.5984, "trio": 0.5291, "goat": 0.7749, "knife": 0.5268}, "stage3_selected_ranks": {"light": 4, "darkness": 2, 
"dark_theme": 41, "medieval_fantasy": 114, "group": 29, "human": 19, "bovid": 39, "trio": 82, "goat": 5, "knife": 84}, "stage3_selected_phrase_ranks": {"light": 1, "darkness": 1, "dark_theme": 2, "medieval_fantasy": 5, "group": 1, "human": 1, "bovid": 3, "trio": 2, "goat": 1, "knife": 3}, "extra_evidence": {"anthro": {"source": "probe"}, "clothing": {"source": "probe"}, "dark_theme": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5937}, "darkness": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8346}, "group": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.6233}, "knife": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.5268}, "light": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7782}, "medieval_fantasy": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4783}, "trio": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.5291}}, "structural": [], "probe": ["clothing", "anthro", "text", "group"], "t1": 1.75, "t2": 1.89, "t3": 79.86, "t3s": 0.66, "t3p": 1.77, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=118 entity=1 copyright_filtered=5 generic_char_to_general=2 unknown_type=2", "Stage3 general_chunk_0: attempt 1 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"weak_implied\"}, {\"i\": 2, \"why\": \"explicit\"}, {\"i\": 3, \"why\": \"other\"}, {\"i\": 4, \"why\": \"weak_implied\"}, {\"i\": 5, \"why\": \"other\"}, {\"i\": 6, \"why\": \"explicit\"}, {\"i\": 7, \"why\": \"other\"}, {\"i\": 8, \"why\": \"explicit\"}, {\"i\": 9, \"why\": \"weak_implied\"}, {\"i\": 10, \"why\": \"weak_implied\"}, {\"i\": 11, \"why\": \"other\"}, {\"i\": 12, \"why\": \"other\"}, {\"i\": 13, \"why\": \"explicit\"}, {\"i\": 14, \"why\": \"explicit\"}, {\"i\": 15, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"weak_implied\"}, {\"i\": 
17, \"why\": \"weak_implied\"}, {\"i\": 18, \"why\": \"other\"}, {\"i\": 19, \"why\": \"other\"}, {\"i\": 20, \"why\": \"weak_implied\"}, {\"i\": 21, \"why\": \"weak_implied\"}, {\"i\": 22, \"why\": \"other\"}, {\"i\": 23, \"why\": \"weak_implied\"}, {\"i\": 24, \"why\": \"other\"}, {\"i\": 25, \"why\": \"weak_implied\"}, {\"i\": 26, \"why\": \"other\"}, {\"i\": 27, \"why\": \"other\"}, {\"i\": 28, \"why\": \"weak_implied\"}, {\"i\": 29, \"why\": \"other\"}, {\"i\": 30, \"why\": \"weak_implied\"}, {\"i\": 31, \"why\": \"other\"}, {\"i\": 32, \"why\": \"weak_implied\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 34, \"why\": \"other\"}, {\"i\": 35, \"why\": \"weak_implied\"}, {\"i\": 36, \"why\": \"weak_implied\"}, {}]}. Got: 2 validation errors for Stage3SelectionResponse\nselections.36.i\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nselections.36.why\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_1: attempt 1 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"weak_implied\"}, {\"i\": 2, \"why\": \"other\"}, {\"i\": 3, \"why\": \"weak_implied\"}, {\"i\": 4, \"why\": \"style_or_meta\"}, {\"i\": 5, \"why\": \"weak_implied\"}, {\"i\": 6, \"why\": \"other\"}, {\"i\": 7, \"why\": \"other\"}, {\"i\": 8, \"why\": \"style_or_meta\"}, {\"i\": 9, \"why\": \"weak_implied\"}, {\"i\": 10, \"why\": \"other\"}, {\"i\": 11, \"why\": \"other\"}, {\"i\": 12, \"why\": \"style_or_meta\"}, {\"i\": 13, \"why\": \"other\"}, {\"i\": 14, \"why\": \"other\"}, {\"i\": 15, \"why\": \"other\"}, {\"i\": 16, \"why\": \"other\"}, {\"i\": 17, \"why\": \"other\"}, {\"i\": 18, \"why\": \"style_or_meta\"}, {\"i\": 19, \"why\": 
\"other\"}, {\"i\": 20, \"why\": \"other\"}, {\"i\": 21, \"why\": \"weak_implied\"}, {\"i\": 22, \"why\": \"other\"}, {\"i\": 23, \"why\": \"other\"}, {\"i\": 24, \"why\": \"style_or_meta\"}, {\"i\": 25, \"why\": \"other\"}, {\"i\": 26, \"why\": \"other\"}, {\"i\": 27, \"why\": \"other\"}, {\"i\": 28, \"why\": \"style_or_meta\"}, {\"i\": 29, \"why\": \"other\"}, {\"i\": 30, \"why\": \"other\"}, {\"i\": 31, \"why\": \"style_or_meta\"}, {\"i\": 32, \"why\": \"weak_implied\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 34, \"why\": \"weak_implied\"}, {\"i\": 35, \"why\": \"other\"}, {\"i\": 36, \"why\": \"other\"}, {\"i\": 37}]}. Got: 1 validation error for Stage3SelectionResponse\nselections.36.why\n Field required [type=missing, input_value={'i': 37}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_1: attempt 2 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"weak_implied\"}, {\"i\": 2, \"why\": \"other\"}, {\"i\": 3, \"why\": \"weak_implied\"}, {\"i\": 4, \"why\": \"style_or_meta\"}, {\"i\": 5, \"why\": \"weak_implied\"}, {\"i\": 6, \"why\": \"other\"}, {\"i\": 7, \"why\": \"other\"}, {\"i\": 8, \"why\": \"weak_implied\"}, {\"i\": 9, \"why\": \"weak_implied\"}, {\"i\": 10, \"why\": \"other\"}, {\"i\": 11, \"why\": \"explicit\"}, {\"i\": 12, \"why\": \"style_or_meta\"}, {\"i\": 13, \"why\": \"other\"}, {\"i\": 14, \"why\": \"other\"}, {\"i\": 15, \"why\": \"other\"}, {\"i\": 16, \"why\": \"other\"}, {\"i\": 17, \"why\": \"weak_implied\"}, {\"i\": 18, \"why\": \"style_or_meta\"}, {\"i\": 19, \"why\": \"weak_implied\"}, {\"i\": 20, \"why\": \"weak_implied\"}, {\"i\": 21, \"why\": \"weak_implied\"}, {\"i\": 22, \"why\": \"weak_implied\"}, {\"i\": 23, \"why\": \"weak_implied\"}, {\"i\": 24, \"why\": \"style_or_meta\"}, {\"i\": 25, \"why\": 
\"other\"}, {\"i\": 26, \"why\": \"other\"}, {\"i\": 27, \"why\": \"weak_implied\"}, {\"i\": 28, \"why\": \"weak_implied\"}, {\"i\": 29, \"why\": \"other\"}, {\"i\": 30, \"why\": \"other\"}, {\"i\": 31, \"why\": \"style_or_meta\"}, {\"i\": 32, \"why\": \"explicit\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 34, \"why\": \"weak_implied\"}, {\"i\": 35, \"why\": \"weak_implied\"}, {\"i\": 36, \"\": null}]}. Got: 1 validation error for Stage3SelectionResponse\nselections.35.why\n Field required [type=missing, input_value={'i': 36, '': None}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_1: attempt 3 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"weak_implied\"}, {\"i\": 2, \"why\": \"other\"}, {\"i\": 3, \"why\": \"weak_implied\"}, {\"i\": 4, \"why\": \"style_or_meta\"}, {\"i\": 5, \"why\": \"weak_implied\"}, {\"i\": 6, \"why\": \"other\"}, {\"i\": 7, \"why\": \"other\"}, {\"i\": 8, \"why\": \"style_or_meta\"}, {\"i\": 9, \"why\": \"weak_implied\"}, {\"i\": 10, \"why\": \"other\"}, {\"i\": 11, \"why\": \"other\"}, {\"i\": 12, \"why\": \"style_or_meta\"}, {\"i\": 13, \"why\": \"other\"}, {\"i\": 14, \"why\": \"other\"}, {\"i\": 15, \"why\": \"other\"}, {\"i\": 16, \"why\": \"other\"}, {\"i\": 17, \"why\": \"other\"}, {\"i\": 18, \"why\": \"style_or_meta\"}, {\"i\": 19, \"why\": \"other\"}, {\"i\": 20, \"why\": \"other\"}, {\"i\": 21, \"why\": \"weak_implied\"}, {\"i\": 22, \"why\": \"other\"}, {\"i\": 23, \"why\": \"other\"}, {\"i\": 24, \"why\": \"style_or_meta\"}, {\"i\": 25, \"why\": \"other\"}, {\"i\": 26, \"why\": \"other\"}, {\"i\": 27, \"why\": \"other\"}, {\"i\": 28, \"why\": \"style_or_meta\"}, {\"i\": 29, \"why\": \"other\"}, {\"i\": 30, \"why\": \"other\"}, {\"i\": 31, \"why\": \"style_or_meta\"}, {\"i\": 32, \"why\": 
\"weak_implied\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 34, \"why\": \"weak_implied\"}, {\"i\": 35, \"why\": \"other\"}, {\"i\": 36, \"why\": \"other\"}, {\"i\": 37}]}. Got: 1 validation error for Stage3SelectionResponse\nselections.36.why\n Field required [type=missing, input_value={'i': 37}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_1: gave up after 3 attempts"]}
8
+ {"id": 335343, "n_gt": 15, "n_retrieved": 128, "n_selected": 36, "n_implied": 7, "n_structural": 3, "n_probe": 3, "ret_R": 0.6, "P": 0.3333, "R": 0.8, "F1": 0.4706, "leaf_P": 0.2692, "leaf_R": 0.5833, "leaf_F1": 0.3684, "n_leaf_sel": 26, "n_leaf_gt": 12, "ret_P": 0.0703, "sel_given_ret": 1.3333, "over_sel": 2.4, "why": {"explicit": 26}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 3, "calls_with_selection": 3, "calls_exhausted_retries": 0, "attempts_total": 3, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 3, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 56, "attempts_by_n_local": {"60": {"attempts": 2, "parse_ok": 2, "parse_fail": 0, "errors": 0}, "11": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3333, "gen_R": 0.8, "gen_F1": 0.4706, "missed": ["angry", "eyes_closed", "sleeping"], "extra": ["annoyed", "annoyed_expression", "anthro", "bed_covers", "bedding", "bedroom", "big_eyes", "clothing", "english_text", "expressions", "eyebrows", "eyes", "green_eyebrows", "humanoid", "lipstick", "lying_on_bed", "on_bed", "pajamas", "red_lipstick", "relaxed_expression", "scenery", "sleepover", "sleepwear", "watermark"], "ground_truth_tags": ["angry", "bed", "blonde_hair", "blue_eyes", "duo", "eyes_closed", "eyeshadow", "furniture", "green_eyes", "hair", "lying", "makeup", "purple_hair", "sleeping", "text"], "selected_tags": ["annoyed", "annoyed_expression", "anthro", "bed", "bed_covers", "bedding", "bedroom", "big_eyes", "blonde_hair", "blue_eyes", "clothing", "duo", "english_text", "expressions", "eyebrows", "eyes", "eyeshadow", "furniture", "green_eyebrows", "green_eyes", "hair", "humanoid", "lipstick", "lying", "lying_on_bed", "makeup", "on_bed", "pajamas", "purple_hair", 
"red_lipstick", "relaxed_expression", "scenery", "sleepover", "sleepwear", "text", "watermark"], "stage3_selected": ["annoyed", "annoyed_expression", "bed_covers", "bedroom", "big_eyes", "blonde_hair", "blue_eyes", "english_text", "expressions", "eyes", "eyeshadow", "green_eyebrows", "green_eyes", "hair", "lipstick", "lying_on_bed", "makeup", "pajamas", "purple_hair", "red_lipstick", "relaxed_expression", "scenery", "sleepover", "sleepwear", "text", "watermark"], "stage3_selected_scores": {"hair": 0.6031, "text": 0.6007, "blue_eyes": 0.6014, "green_eyes": 0.5989, "blonde_hair": 0.5986, "purple_hair": 0.5642, "makeup": 0.5965, "eyeshadow": 0.4763, "watermark": 0.6042, "lipstick": 0.4874, "bedroom": 0.4901, "big_eyes": 0.4289, "annoyed": 0.5727, "lying_on_bed": 0.4093, "pajamas": 0.3753, "red_lipstick": 0.4709, "scenery": 0.4936, "annoyed_expression": 0.7251, "bed_covers": 0.4145, "expressions": 0.5439, "green_eyebrows": 0.5014, "sleepover": 0.5269, "sleepwear": 0.4462, "relaxed_expression": 0.4534, "eyes": 0.8951, "english_text": 0.4189}, "stage3_selected_ranks": {"hair": 5, "text": 8, "blue_eyes": 7, "green_eyes": 9, "blonde_hair": 10, "purple_hair": 14, "makeup": 11, "eyeshadow": 46, "watermark": 4, "lipstick": 40, "bedroom": 38, "big_eyes": 74, "annoyed": 13, "lying_on_bed": 94, "pajamas": 112, "red_lipstick": 49, "scenery": 36, "annoyed_expression": 2, "bed_covers": 89, "expressions": 18, "green_eyebrows": 31, "sleepover": 26, "sleepwear": 62, "relaxed_expression": 55, "eyes": 1, "english_text": 85}, "stage3_selected_phrase_ranks": {"hair": 1, "text": 1, "blue_eyes": 1, "green_eyes": 1, "blonde_hair": 1, "purple_hair": 1, "makeup": 1, "eyeshadow": 3, "watermark": 1, "lipstick": 2, "bedroom": 1, "big_eyes": 3, "annoyed": 2, "lying_on_bed": 4, "pajamas": 3, "red_lipstick": 5, "scenery": 2, "annoyed_expression": 1, "bed_covers": 3, "expressions": 3, "green_eyebrows": 2, "sleepover": 1, "sleepwear": 6, "relaxed_expression": 6, "eyes": 1, "english_text": 4}, 
"extra_evidence": {"annoyed": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5727}, "annoyed_expression": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7251}, "anthro": {"source": "probe"}, "bed_covers": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4145}, "bedding": {"source": "implied"}, "bedroom": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4901}, "big_eyes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4289}, "clothing": {"source": "implied"}, "english_text": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4189}, "expressions": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5439}, "eyebrows": {"source": "implied"}, "eyes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8951}, "green_eyebrows": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5014}, "humanoid": {"source": "structural"}, "lipstick": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4874}, "lying_on_bed": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4093}, "on_bed": {"source": "implied"}, "pajamas": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3753}, "red_lipstick": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4709}, "relaxed_expression": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4534}, "scenery": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4936}, "sleepover": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5269}, "sleepwear": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4462}, "watermark": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6042}}, "structural": ["duo", "humanoid", "text"], "probe": ["simple_background", "anthro", "duo"], "t1": 3.37, "t2": 1.96, "t3": 29.39, "t3s": 2.54, "t3p": 6.41, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: 
general=131 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=4"]}
9
+ {"id": 17482, "n_gt": 22, "n_retrieved": 93, "n_selected": 26, "n_implied": 10, "n_structural": 4, "n_probe": 3, "ret_R": 0.2727, "P": 0.4615, "R": 0.5455, "F1": 0.5, "leaf_P": 0.4, "leaf_R": 0.4615, "leaf_F1": 0.4286, "n_leaf_sel": 15, "n_leaf_gt": 13, "ret_P": 0.0645, "sel_given_ret": 2.0, "over_sel": 1.18, "why": {"explicit": 11}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 57, "attempts_by_n_local": {"60": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}, "38": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.4615, "gen_R": 0.5455, "gen_F1": 0.5, "missed": ["bass_guitar", "canine", "fur", "guitar", "holding_musical_instrument", "holding_object", "music", "musical_instrument", "plucked_string_instrument", "string_instrument"], "extra": ["accessory", "bottomwear", "flowing_hair", "hair_accessory", "hair_tie", "holding_hair", "long_tail", "looking_at_viewer", "playful", "playing_bass", "shorts", "torn_bottomwear", "torn_shorts", "touching_hair"], "ground_truth_tags": ["anthro", "bass_guitar", "canid", "canine", "claws", "clothed", "clothing", "fingers", "fur", "guitar", "hair", "holding_musical_instrument", "holding_object", "mammal", "music", "musical_instrument", "plucked_string_instrument", "solo", "spade_tail", "string_instrument", "tail", "torn_clothing"], "selected_tags": ["accessory", "anthro", "bottomwear", "canid", "claws", "clothed", "clothing", "fingers", "flowing_hair", "hair", "hair_accessory", "hair_tie", "holding_hair", "long_tail", "looking_at_viewer", "mammal", 
"playful", "playing_bass", "shorts", "solo", "spade_tail", "tail", "torn_bottomwear", "torn_clothing", "torn_shorts", "touching_hair"], "stage3_selected": ["claws", "fingers", "flowing_hair", "hair_tie", "holding_hair", "long_tail", "playful", "playing_bass", "spade_tail", "torn_clothing", "torn_shorts"], "stage3_selected_scores": {"claws": 0.6306, "fingers": 0.5071, "torn_clothing": 0.4536, "long_tail": 0.502, "spade_tail": 0.8721, "hair_tie": 0.4654, "playful": 0.3766, "torn_shorts": 0.4535, "flowing_hair": 0.702, "holding_hair": 0.4924, "playing_bass": 0.5052}, "stage3_selected_ranks": {"claws": 5, "fingers": 41, "torn_clothing": 68, "long_tail": 45, "spade_tail": 1, "hair_tie": 62, "playful": 88, "torn_shorts": 69, "flowing_hair": 2, "holding_hair": 51, "playing_bass": 42}, "stage3_selected_phrase_ranks": {"claws": 1, "fingers": 4, "torn_clothing": 3, "long_tail": 5, "spade_tail": 1, "hair_tie": 3, "playful": 4, "torn_shorts": 4, "flowing_hair": 1, "holding_hair": 3, "playing_bass": 1}, "extra_evidence": {"accessory": {"source": "implied"}, "bottomwear": {"source": "implied"}, "flowing_hair": {"source": "stage3", "why": "explicit", "retrieval_score": 0.702}, "hair_accessory": {"source": "implied"}, "hair_tie": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4654}, "holding_hair": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4924}, "long_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.502}, "looking_at_viewer": {"source": "structural"}, "playful": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3766}, "playing_bass": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5052}, "shorts": {"source": "implied"}, "torn_bottomwear": {"source": "implied"}, "torn_shorts": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4535}, "touching_hair": {"source": "implied"}}, "structural": ["solo", "anthro", "clothed", "looking_at_viewer"], "probe": ["anthro", "canid", "solo"], "t1": 8.84, "t2": 
1.31, "t3": 17.56, "t3s": 0.65, "t3p": 5.44, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=98 entity=0 copyright_filtered=1 generic_char_to_general=0 unknown_type=3"]}
10
+ {"id": 2021552, "n_gt": 25, "n_retrieved": 59, "n_selected": 17, "n_implied": 6, "n_structural": 4, "n_probe": 3, "ret_R": 0.36, "P": 0.6471, "R": 0.44, "F1": 0.5238, "leaf_P": 0.5556, "leaf_R": 0.3333, "leaf_F1": 0.4167, "n_leaf_sel": 9, "n_leaf_gt": 15, "ret_P": 0.1525, "sel_given_ret": 1.2222, "over_sel": 0.68, "why": {"explicit": 7}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 23, "attempts_by_n_local": {"58": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.6471, "gen_R": 0.44, "gen_F1": 0.5238, "missed": ["bottomwear", "canid", "canine", "crossed_arms", "fox", "grey_background", "lagomorph", "leporid", "looking_at_another", "mammal", "overalls", "pants", "rabbit", "standing"], "extra": ["4_claws", "5_claws", "looking_at_viewer", "white_clothing", "white_shirt", "white_topwear"], "ground_truth_tags": ["anthro", "bottomwear", "canid", "canine", "claws", "clothed", "clothing", "crossed_arms", "duo", "facial_markings", "fox", "fur", "grey_background", "head_markings", "lagomorph", "leporid", "looking_at_another", "mammal", "markings", "overalls", "pants", "rabbit", "shirt", "standing", "topwear"], "selected_tags": ["4_claws", "5_claws", "anthro", "claws", "clothed", "clothing", "duo", "facial_markings", "fur", "head_markings", "looking_at_viewer", "markings", "shirt", "topwear", "white_clothing", "white_shirt", "white_topwear"], "stage3_selected": ["4_claws", "5_claws", "claws", "facial_markings", "fur", "shirt", "white_shirt"], "stage3_selected_scores": {"fur": 0.7019, "claws": 0.6694, "shirt": 
0.7044, "facial_markings": 0.9019, "white_shirt": 0.5527, "4_claws": 0.6125, "5_claws": 0.6238}, "stage3_selected_ranks": {"fur": 11, "claws": 13, "shirt": 10, "facial_markings": 1, "white_shirt": 48, "4_claws": 21, "5_claws": 20}, "stage3_selected_phrase_ranks": {"fur": 1, "claws": 1, "shirt": 1, "facial_markings": 1, "white_shirt": 4, "4_claws": 3, "5_claws": 1}, "extra_evidence": {"4_claws": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6125}, "5_claws": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6238}, "looking_at_viewer": {"source": "structural"}, "white_clothing": {"source": "implied"}, "white_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5527}, "white_topwear": {"source": "implied"}}, "structural": ["duo", "anthro", "clothed", "looking_at_viewer"], "probe": ["simple_background", "anthro", "duo"], "t1": 31.27, "t2": 0.92, "t3": 15.93, "t3s": 0.43, "t3p": 11.26, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "LLM rewrite: fallback (error: ReadTimeout: The read operation timed out)", "Stage3 split: general=58 entity=1 copyright_filtered=0 generic_char_to_general=0 unknown_type=0"]}
11
+ {"id": 2034167, "n_gt": 11, "n_retrieved": 127, "n_selected": 34, "n_implied": 9, "n_structural": 5, "n_probe": 3, "ret_R": 0.5455, "P": 0.2647, "R": 0.8182, "F1": 0.4, "leaf_P": 0.2, "leaf_R": 0.5714, "leaf_F1": 0.2963, "n_leaf_sel": 20, "n_leaf_gt": 7, "ret_P": 0.0472, "sel_given_ret": 1.5, "over_sel": 3.09, "why": {"explicit": 8, "strong_implied": 12}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 3, "calls_with_selection": 3, "calls_exhausted_retries": 0, "attempts_total": 3, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 3, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 55, "attempts_by_n_local": {"60": {"attempts": 2, "parse_ok": 2, "parse_fail": 0, "errors": 0}, "12": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.2647, "gen_R": 0.8182, "gen_F1": 0.4, "missed": ["blue_nose", "open_mouth"], "extra": ["animal_humanoid", "anime_eyes", "anthro", "big_eyes", "blue_ears", "blue_stripes", "canid_humanoid", "canine_humanoid", "clothed", "clothing", "cute_expression", "fennec_humanoid", "fluffy_fur", "fox_humanoid", "humanoid", "intersex", "jumper", "looking_at_viewer", "mammal_humanoid", "multi_tone_fur", "multicolored_body", "multicolored_fur", "pink_ears", "pink_stripes", "stripes"], "ground_truth_tags": ["blue_eyes", "blue_nose", "canid", "canine", "fur", "mammal", "open_mouth", "purple_body", "solo", "white_body", "white_fur"], "selected_tags": ["animal_humanoid", "anime_eyes", "anthro", "big_eyes", "blue_ears", "blue_eyes", "blue_stripes", "canid", "canid_humanoid", "canine", "canine_humanoid", "clothed", "clothing", "cute_expression", "fennec_humanoid", "fluffy_fur", "fox_humanoid", "fur", "humanoid", "intersex", "jumper", "looking_at_viewer", "mammal", 
"mammal_humanoid", "multi_tone_fur", "multicolored_body", "multicolored_fur", "pink_ears", "pink_stripes", "purple_body", "solo", "stripes", "white_body", "white_fur"], "stage3_selected": ["animal_humanoid", "anime_eyes", "big_eyes", "blue_ears", "blue_eyes", "blue_stripes", "canid_humanoid", "canine_humanoid", "cute_expression", "fennec_humanoid", "fluffy_fur", "fox_humanoid", "jumper", "multi_tone_fur", "multicolored_fur", "pink_ears", "pink_stripes", "purple_body", "simple_background", "white_fur"], "stage3_selected_scores": {"simple_background": 0.5994, "blue_eyes": 0.6045, "white_fur": 0.6039, "purple_body": 0.5693, "canine_humanoid": 0.9013, "blue_stripes": 0.6786, "pink_stripes": 0.6846, "fennec_humanoid": 0.7743, "multicolored_fur": 0.5035, "animal_humanoid": 0.6191, "canid_humanoid": 0.863, "fox_humanoid": 0.8214, "big_eyes": 0.4219, "blue_ears": 0.5093, "cute_expression": 0.4501, "pink_ears": 0.5282, "multi_tone_fur": 0.5185, "fluffy_fur": 0.5591, "anime_eyes": 0.4933, "jumper": 0.4075}, "stage3_selected_ranks": {"simple_background": 29, "blue_eyes": 24, "white_fur": 25, "purple_body": 41, "canine_humanoid": 1, "blue_stripes": 10, "pink_stripes": 9, "fennec_humanoid": 6, "multicolored_fur": 76, "animal_humanoid": 15, "canid_humanoid": 2, "fox_humanoid": 3, "big_eyes": 112, "blue_ears": 73, "cute_expression": 107, "pink_ears": 62, "multi_tone_fur": 67, "fluffy_fur": 48, "anime_eyes": 82, "jumper": 116}, "stage3_selected_phrase_ranks": {"simple_background": 1, "blue_eyes": 1, "white_fur": 1, "purple_body": 2, "canine_humanoid": 1, "blue_stripes": 1, "pink_stripes": 1, "fennec_humanoid": 6, "multicolored_fur": 6, "animal_humanoid": 2, "canid_humanoid": 2, "fox_humanoid": 3, "big_eyes": 6, "blue_ears": 5, "cute_expression": 6, "pink_ears": 5, "multi_tone_fur": 5, "fluffy_fur": 2, "anime_eyes": 2, "jumper": 3}, "extra_evidence": {"animal_humanoid": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.6191}, "anime_eyes": {"source": "stage3", 
"why": "strong_implied", "retrieval_score": 0.4933}, "anthro": {"source": "structural"}, "big_eyes": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4219}, "blue_ears": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.5093}, "blue_stripes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6786}, "canid_humanoid": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.863}, "canine_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9013}, "clothed": {"source": "structural"}, "clothing": {"source": "implied"}, "cute_expression": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4501}, "fennec_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7743}, "fluffy_fur": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.5591}, "fox_humanoid": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.8214}, "humanoid": {"source": "implied"}, "intersex": {"source": "structural"}, "jumper": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4075}, "looking_at_viewer": {"source": "structural"}, "mammal_humanoid": {"source": "implied"}, "multi_tone_fur": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.5185}, "multicolored_body": {"source": "implied"}, "multicolored_fur": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.5035}, "pink_ears": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.5282}, "pink_stripes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6846}, "stripes": {"source": "implied"}}, "structural": ["solo", "anthro", "intersex", "clothed", "looking_at_viewer"], "probe": ["anthro", "canid", "solo"], "t1": 7.78, "t2": 1.99, "t3": 32.33, "t3s": 0.94, "t3p": 3.05, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=132 entity=0 copyright_filtered=0 
generic_char_to_general=0 unknown_type=4"]}
data/eval_results/latency_chunk60_k6_seed43.jsonl ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"_meta": true, "timestamp": "2026-03-02T06:01:27.775437", "n_samples": 10, "caption_field": "caption_cogvlm", "skip_rewrite": false, "allow_nsfw": false, "mode": "chunked_map_union", "chunk_size": 60, "eval_path": "data/eval_samples/e621_sfw_sample_1000_seed123_buffer10000_caption_evident.jsonl", "per_phrase_k": 2, "per_phrase_final_k": 6, "temperature": 0.0, "shuffle": false, "seed": 43, "workers": 4, "min_why": "strong_implied", "expand_implications": true, "infer_structural": true, "infer_probe": true, "n_errors": 0, "n_issue_samples": 10, "n_issues_total": 26}
2
+ {"id": 3285630, "n_gt": 12, "n_retrieved": 92, "n_selected": 51, "n_implied": 23, "n_structural": 4, "n_probe": 5, "ret_R": 0.25, "P": 0.1765, "R": 0.75, "F1": 0.2857, "leaf_P": 0.12, "leaf_R": 0.3333, "leaf_F1": 0.1765, "n_leaf_sel": 25, "n_leaf_gt": 9, "ret_P": 0.0326, "sel_given_ret": 3.0, "over_sel": 4.25, "why": {"explicit": 14, "strong_implied": 8}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 43, "attempts_by_n_local": {"60": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}, "35": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.1765, "gen_R": 0.75, "gen_F1": 0.2857, "missed": ["alpha_channel", "fingers", "male"], "extra": ["beer_mug", "black_hands", "bottom_heavy", "brown_clothing", "brown_coat", "brown_topwear", "business_attire", "coat", "container", "cup", "domestic_cat", "dress_shirt", "felis", "formal", "gesture", "grey_clothing", "grey_shirt", "grey_topwear", "hair_bun", "handshake", "holding_container", "holding_cup", "holding_object", "huge_hips", "looking_at_viewer", "necktie", "pockets", "shirt", "sweater", "sweater_vest", "teal_shirt", "teal_topwear", "text", "topwear", "vest", "white_body", "white_clothing", "white_dress_shirt", "white_fur", "white_necktie", "white_shirt", "white_topwear"], "ground_truth_tags": ["alpha_channel", "anthro", "clothed", "clothing", "felid", "feline", "fingers", "fur", "hair", "male", "mammal", "solo"], "selected_tags": ["anthro", "beer_mug", "black_hands", "bottom_heavy", "brown_clothing", "brown_coat", "brown_topwear", 
"business_attire", "clothed", "clothing", "coat", "container", "cup", "domestic_cat", "dress_shirt", "felid", "feline", "felis", "formal", "fur", "gesture", "grey_clothing", "grey_shirt", "grey_topwear", "hair", "hair_bun", "handshake", "holding_container", "holding_cup", "holding_object", "huge_hips", "looking_at_viewer", "mammal", "necktie", "pockets", "shirt", "solo", "sweater", "sweater_vest", "teal_shirt", "teal_topwear", "text", "topwear", "vest", "white_body", "white_clothing", "white_dress_shirt", "white_fur", "white_necktie", "white_shirt", "white_topwear"], "stage3_selected": ["beer_mug", "black_hands", "bottom_heavy", "brown_coat", "business_attire", "domestic_cat", "dress_shirt", "formal", "grey_shirt", "hair_bun", "handshake", "holding_container", "holding_cup", "huge_hips", "pockets", "simple_background", "sweater_vest", "teal_shirt", "white_dress_shirt", "white_fur", "white_necktie", "white_shirt"], "stage3_selected_scores": {"simple_background": 0.6979, "white_shirt": 0.738, "holding_container": 0.76, "dress_shirt": 0.7242, "hair_bun": 0.6927, "bottom_heavy": 0.4664, "grey_shirt": 0.7582, "sweater_vest": 0.7533, "white_dress_shirt": 0.6881, "handshake": 0.5512, "formal": 0.5993, "business_attire": 0.5658, "teal_shirt": 0.7475, "white_necktie": 0.6418, "white_fur": 0.598, "domestic_cat": 0.633, "huge_hips": 0.4406, "pockets": 0.6095, "holding_cup": 0.7668, "black_hands": 0.4563, "beer_mug": 0.6599, "brown_coat": 0.7267}, "stage3_selected_ranks": {"simple_background": 35, "white_shirt": 20, "holding_container": 11, "dress_shirt": 25, "hair_bun": 37, "bottom_heavy": 90, "grey_shirt": 13, "sweater_vest": 16, "white_dress_shirt": 38, "handshake": 75, "formal": 59, "business_attire": 70, "teal_shirt": 19, "white_necktie": 44, "white_fur": 60, "domestic_cat": 48, "huge_hips": 93, "pockets": 56, "holding_cup": 10, "black_hands": 91, "beer_mug": 41, "brown_coat": 24}, "stage3_selected_phrase_ranks": {"simple_background": 1, "white_shirt": 4, 
"holding_container": 5, "dress_shirt": 2, "hair_bun": 1, "bottom_heavy": 4, "grey_shirt": 1, "sweater_vest": 3, "white_dress_shirt": 5, "handshake": 3, "formal": 1, "business_attire": 1, "teal_shirt": 4, "white_necktie": 6, "white_fur": 3, "domestic_cat": 4, "huge_hips": 5, "pockets": 3, "holding_cup": 4, "black_hands": 6, "beer_mug": 5, "brown_coat": 5}, "extra_evidence": {"beer_mug": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.6599}, "black_hands": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4563}, "bottom_heavy": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4664}, "brown_clothing": {"source": "implied"}, "brown_coat": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.7267}, "brown_topwear": {"source": "implied"}, "business_attire": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5658}, "coat": {"source": "implied"}, "container": {"source": "implied"}, "cup": {"source": "implied"}, "domestic_cat": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.633}, "dress_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7242}, "felis": {"source": "implied"}, "formal": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5993}, "gesture": {"source": "implied"}, "grey_clothing": {"source": "implied"}, "grey_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7582}, "grey_topwear": {"source": "implied"}, "hair_bun": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6927}, "handshake": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5512}, "holding_container": {"source": "stage3", "why": "explicit", "retrieval_score": 0.76}, "holding_cup": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.7668}, "holding_object": {"source": "implied"}, "huge_hips": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4406}, "looking_at_viewer": {"source": "structural"}, "necktie": {"source": 
"implied"}, "pockets": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.6095}, "shirt": {"source": "implied"}, "sweater": {"source": "implied"}, "sweater_vest": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7533}, "teal_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7475}, "teal_topwear": {"source": "implied"}, "text": {"source": "probe"}, "topwear": {"source": "implied"}, "vest": {"source": "implied"}, "white_body": {"source": "implied"}, "white_clothing": {"source": "implied"}, "white_dress_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6881}, "white_fur": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.598}, "white_necktie": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6418}, "white_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.738}, "white_topwear": {"source": "implied"}}, "structural": ["solo", "anthro", "clothed", "looking_at_viewer"], "probe": ["clothing", "anthro", "text", "felid", "solo"], "t1": 3.57, "t2": 4.55, "t3": 14.76, "t3s": 6.34, "t3p": 5.75, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=95 entity=1 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
3
+ {"id": 260449, "n_gt": 14, "n_retrieved": 98, "n_selected": 19, "n_implied": 6, "n_structural": 0, "n_probe": 6, "ret_R": 0.5714, "P": 0.4737, "R": 0.6429, "F1": 0.5455, "leaf_P": 0.3636, "leaf_R": 0.4, "leaf_F1": 0.381, "n_leaf_sel": 11, "n_leaf_gt": 10, "ret_P": 0.0816, "sel_given_ret": 1.125, "over_sel": 1.36, "why": {"explicit": 10}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 71, "attempts_by_n_local": {"60": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}, "41": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.5294, "gen_R": 0.6429, "gen_F1": 0.5806, "missed": ["clothed", "fur", "hair", "human", "male"], "extra": ["anthro", "bottomwear", "cheeky", "donkey_kong_(series)", "duo", "gorilla", "kong", "loincloth", "nintendo", "raised_arms"], "ground_truth_tags": ["ape", "bear", "clothed", "clothing", "dancing", "fur", "group", "hair", "haplorhine", "human", "looking_at_viewer", "male", "mammal", "primate"], "selected_tags": ["anthro", "ape", "bear", "bottomwear", "cheeky", "clothing", "dancing", "donkey_kong_(series)", "duo", "gorilla", "group", "haplorhine", "kong", "loincloth", "looking_at_viewer", "mammal", "nintendo", "primate", "raised_arms"], "stage3_selected": ["bear", "cheeky", "dancing", "gorilla", "kong", "loincloth", "looking_at_viewer", "primate", "raised_arms", "simple_background"], "stage3_selected_scores": {"simple_background": 0.5459, "looking_at_viewer": 0.5448, "bear": 0.5727, "primate": 0.8898, "loincloth": 0.5668, "dancing": 0.5549, "raised_arms": 0.543, 
"gorilla": 0.8294, "kong": 0.7484, "cheeky": 0.3883}, "stage3_selected_ranks": {"simple_background": 13, "looking_at_viewer": 14, "bear": 8, "primate": 2, "loincloth": 9, "dancing": 12, "raised_arms": 15, "gorilla": 4, "kong": 7, "cheeky": 64}, "stage3_selected_phrase_ranks": {"simple_background": 1, "looking_at_viewer": 1, "bear": 1, "primate": 1, "loincloth": 1, "dancing": 1, "raised_arms": 1, "gorilla": 2, "kong": 5, "cheeky": 1}, "extra_evidence": {"anthro": {"source": "probe"}, "bottomwear": {"source": "implied"}, "cheeky": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3883}, "donkey_kong_(series)": {"source": "implied"}, "duo": {"source": "probe"}, "gorilla": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8294}, "kong": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7484}, "loincloth": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5668}, "nintendo": {"source": "implied"}, "raised_arms": {"source": "stage3", "why": "explicit", "retrieval_score": 0.543}}, "structural": [], "probe": ["clothing", "simple_background", "anthro", "duo", "group", "bear"], "t1": 5.98, "t2": 5.27, "t3": 11.94, "t3s": 5.13, "t3p": 5.56, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=101 entity=2 copyright_filtered=1 generic_char_to_general=1 unknown_type=3"]}
4
+ {"id": 1078019, "n_gt": 14, "n_retrieved": 79, "n_selected": 20, "n_implied": 3, "n_structural": 4, "n_probe": 5, "ret_R": 0.7143, "P": 0.4, "R": 0.5714, "F1": 0.4706, "leaf_P": 0.4, "leaf_R": 0.6667, "leaf_F1": 0.5, "n_leaf_sel": 15, "n_leaf_gt": 9, "ret_P": 0.1266, "sel_given_ret": 0.8, "over_sel": 1.43, "why": {"explicit": 12}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 36, "attempts_by_n_local": {"60": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}, "19": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.4, "gen_R": 0.5714, "gen_F1": 0.4706, "missed": ["lagomorph", "leporid", "mammal", "rabbit", "romantic", "romantic_couple"], "extra": ["<3", "coat", "holding_object", "holding_plushie", "intimate", "lab_coat", "looking_at_viewer", "love", "surprised_expression", "teal_clothing", "topwear", "vest"], "ground_truth_tags": ["anthro", "blue_eyes", "blush", "clothed", "clothing", "duo", "lagomorph", "leporid", "mammal", "plushie", "rabbit", "romantic", "romantic_couple", "teal_eyes"], "selected_tags": ["<3", "anthro", "blue_eyes", "blush", "clothed", "clothing", "coat", "duo", "holding_object", "holding_plushie", "intimate", "lab_coat", "looking_at_viewer", "love", "plushie", "surprised_expression", "teal_clothing", "teal_eyes", "topwear", "vest"], "stage3_selected": ["blue_eyes", "blush", "coat", "duo", "holding_plushie", "intimate", "lab_coat", "love", "surprised_expression", "teal_clothing", "teal_eyes", "vest"], "stage3_selected_scores": {"duo": 0.3632, "blush": 0.6084, 
"blue_eyes": 0.6154, "coat": 0.6386, "vest": 0.503, "love": 0.4696, "teal_eyes": 0.6285, "surprised_expression": 0.6393, "lab_coat": 0.5162, "intimate": 0.4406, "holding_plushie": 0.7794, "teal_clothing": 0.4339}, "stage3_selected_ranks": {"duo": 77, "blush": 13, "blue_eyes": 12, "coat": 7, "vest": 43, "love": 51, "teal_eyes": 8, "surprised_expression": 6, "lab_coat": 37, "intimate": 63, "holding_plushie": 2, "teal_clothing": 65}, "stage3_selected_phrase_ranks": {"duo": 3, "blush": 1, "blue_eyes": 1, "coat": 1, "vest": 6, "love": 5, "teal_eyes": 1, "surprised_expression": 2, "lab_coat": 5, "intimate": 6, "holding_plushie": 1, "teal_clothing": 6}, "extra_evidence": {"<3": {"source": "probe"}, "coat": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6386}, "holding_object": {"source": "implied"}, "holding_plushie": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7794}, "intimate": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4406}, "lab_coat": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5162}, "looking_at_viewer": {"source": "structural"}, "love": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4696}, "surprised_expression": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6393}, "teal_clothing": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4339}, "topwear": {"source": "implied"}, "vest": {"source": "stage3", "why": "explicit", "retrieval_score": 0.503}}, "structural": ["duo", "anthro", "clothed", "looking_at_viewer"], "probe": ["clothing", "anthro", "blush", "duo", "<3"], "t1": 3.56, "t2": 1.54, "t3": 8.42, "t3s": 5.22, "t3p": 12.79, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=79 entity=0 copyright_filtered=1 generic_char_to_general=0 unknown_type=2"]}
5
+ {"id": 1624724, "n_gt": 4, "n_retrieved": 67, "n_selected": 15, "n_implied": 0, "n_structural": 3, "n_probe": 4, "ret_R": 0.75, "P": 0.2667, "R": 1.0, "F1": 0.4211, "leaf_P": 0.2667, "leaf_R": 1.0, "leaf_F1": 0.4211, "n_leaf_sel": 15, "n_leaf_gt": 4, "ret_P": 0.0448, "sel_given_ret": 1.3333, "over_sel": 3.75, "why": {"explicit": 9, "strong_implied": 2}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 27, "attempts_by_n_local": {"60": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}, "8": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.2667, "gen_R": 1.0, "gen_F1": 0.4211, "missed": [], "extra": ["anthro", "big_eyes", "clothing", "feral", "floating", "floating_hands", "floating_limbs", "nose", "nude", "spots", "toony"], "ground_truth_tags": ["red_nose", "smile", "solo", "tan_body"], "selected_tags": ["anthro", "big_eyes", "clothing", "feral", "floating", "floating_hands", "floating_limbs", "nose", "nude", "red_nose", "smile", "solo", "spots", "tan_body", "toony"], "stage3_selected": ["big_eyes", "floating", "floating_hands", "floating_limbs", "nose", "red_nose", "smile", "spots", "tan_body", "toony", "white_background"], "stage3_selected_scores": {"smile": 0.6007, "white_background": 0.6129, "tan_body": 0.6622, "spots": 0.6267, "toony": 0.6012, "big_eyes": 0.6955, "red_nose": 0.7496, "floating": 0.6508, "nose": 0.8608, "floating_hands": 0.4342, "floating_limbs": 0.4364}, "stage3_selected_ranks": {"smile": 28, "white_background": 25, "tan_body": 14, "spots": 22, "toony": 27, "big_eyes": 6, 
"red_nose": 3, "floating": 16, "nose": 2, "floating_hands": 66, "floating_limbs": 65}, "stage3_selected_phrase_ranks": {"smile": 2, "white_background": 1, "tan_body": 6, "spots": 6, "toony": 1, "big_eyes": 1, "red_nose": 1, "floating": 1, "nose": 1, "floating_hands": 5, "floating_limbs": 4}, "extra_evidence": {"anthro": {"source": "probe"}, "big_eyes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6955}, "clothing": {"source": "probe"}, "feral": {"source": "structural"}, "floating": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6508}, "floating_hands": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4342}, "floating_limbs": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4364}, "nose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8608}, "nude": {"source": "structural"}, "spots": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6267}, "toony": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6012}}, "structural": ["solo", "feral", "nude"], "probe": ["clothing", "simple_background", "anthro", "solo"], "t1": 4.99, "t2": 1.38, "t3": 3.29, "t3s": 5.69, "t3p": 12.8, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=68 entity=0 copyright_filtered=2 generic_char_to_general=0 unknown_type=5"]}
6
+ {"id": 1325009, "n_gt": 22, "n_retrieved": 104, "n_selected": 26, "n_implied": 7, "n_structural": 5, "n_probe": 3, "ret_R": 0.3636, "P": 0.5, "R": 0.5909, "F1": 0.5417, "leaf_P": 0.2105, "leaf_R": 0.3333, "leaf_F1": 0.2581, "n_leaf_sel": 19, "n_leaf_gt": 12, "ret_P": 0.0769, "sel_given_ret": 1.625, "over_sel": 1.18, "why": {"explicit": 13}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 63, "attempts_by_n_local": {"60": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}, "42": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.5, "gen_R": 0.5909, "gen_F1": 0.5417, "missed": ["chest_tuft", "countershading", "muscular", "muscular_anthro", "muscular_male", "pantherine", "tiger", "topless", "tuft"], "extra": ["countershade_body", "facial_tuft", "flexing_bicep", "fluffy_fur", "hand_on_arm", "hand_on_own_head", "looking_at_viewer", "playing", "striped_body", "striped_fur", "white_chest", "yellow_bottomwear", "yellow_clothing"], "ground_truth_tags": ["anthro", "blue_eyes", "bottomwear", "chest_tuft", "clothed", "clothing", "countershading", "felid", "fur", "hand_on_head", "male", "mammal", "muscular", "muscular_anthro", "muscular_male", "pantherine", "shorts", "solo", "stripes", "tiger", "topless", "tuft"], "selected_tags": ["anthro", "blue_eyes", "bottomwear", "clothed", "clothing", "countershade_body", "facial_tuft", "felid", "flexing_bicep", "fluffy_fur", "fur", "hand_on_arm", "hand_on_head", "hand_on_own_head", "looking_at_viewer", "male", "mammal", "playing", "shorts", "solo", "striped_body", 
"striped_fur", "stripes", "white_chest", "yellow_bottomwear", "yellow_clothing"], "stage3_selected": ["blue_eyes", "countershade_body", "facial_tuft", "flexing_bicep", "fluffy_fur", "hand_on_arm", "hand_on_head", "hand_on_own_head", "playing", "shorts", "striped_fur", "white_chest", "yellow_bottomwear"], "stage3_selected_scores": {"blue_eyes": 0.5717, "shorts": 0.5785, "facial_tuft": 0.4821, "striped_fur": 0.6385, "hand_on_head": 0.5932, "hand_on_arm": 0.608, "playing": 0.3366, "flexing_bicep": 0.6624, "yellow_bottomwear": 0.652, "white_chest": 0.9198, "countershade_body": 0.8754, "fluffy_fur": 0.6693, "hand_on_own_head": 0.525}, "stage3_selected_ranks": {"blue_eyes": 45, "shorts": 44, "facial_tuft": 64, "striped_fur": 29, "hand_on_head": 40, "hand_on_arm": 34, "playing": 99, "flexing_bicep": 26, "yellow_bottomwear": 28, "white_chest": 2, "countershade_body": 3, "fluffy_fur": 24, "hand_on_own_head": 48}, "stage3_selected_phrase_ranks": {"blue_eyes": 2, "shorts": 1, "facial_tuft": 6, "striped_fur": 2, "hand_on_head": 2, "hand_on_arm": 1, "playing": 3, "flexing_bicep": 5, "yellow_bottomwear": 1, "white_chest": 1, "countershade_body": 1, "fluffy_fur": 1, "hand_on_own_head": 4}, "extra_evidence": {"countershade_body": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8754}, "facial_tuft": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4821}, "flexing_bicep": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6624}, "fluffy_fur": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6693}, "hand_on_arm": {"source": "stage3", "why": "explicit", "retrieval_score": 0.608}, "hand_on_own_head": {"source": "stage3", "why": "explicit", "retrieval_score": 0.525}, "looking_at_viewer": {"source": "structural"}, "playing": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3366}, "striped_body": {"source": "implied"}, "striped_fur": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6385}, "white_chest": {"source": 
"stage3", "why": "explicit", "retrieval_score": 0.9198}, "yellow_bottomwear": {"source": "stage3", "why": "explicit", "retrieval_score": 0.652}, "yellow_clothing": {"source": "implied"}}, "structural": ["solo", "anthro", "male", "clothed", "looking_at_viewer"], "probe": ["anthro", "felid", "solo"], "t1": 1.93, "t2": 1.48, "t3": 34.51, "t3s": 1.74, "t3p": 5.82, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=102 entity=1 copyright_filtered=2 generic_char_to_general=0 unknown_type=2"]}
7
+ {"id": 1023509, "n_gt": 13, "n_retrieved": 118, "n_selected": 7, "n_implied": 0, "n_structural": 5, "n_probe": 4, "ret_R": 0.4615, "P": 0.1429, "R": 0.0769, "F1": 0.1, "leaf_P": 0.1429, "leaf_R": 0.1667, "leaf_F1": 0.1538, "n_leaf_sel": 7, "n_leaf_gt": 6, "ret_P": 0.0508, "sel_given_ret": 0.1667, "over_sel": 0.54, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 0, "calls_exhausted_retries": 2, "attempts_total": 6, "attempt_errors": 4, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 0, "attempts_by_n_local": {"60": {"attempts": 3, "parse_ok": 0, "parse_fail": 0, "errors": 3}, "50": {"attempts": 3, "parse_ok": 2, "parse_fail": 0, "errors": 1}}, "attempt_failure_rate": 0.6666666666666666, "call_exhaustion_rate": 1.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.1429, "gen_R": 0.0769, "gen_F1": 0.1, "missed": ["bovid", "caprine", "dialogue", "fur", "goat", "human", "lizard", "mammal", "reptile", "scalie", "white_body", "white_fur"], "extra": ["anthro", "clothing", "duo", "group", "solo", "taur"], "ground_truth_tags": ["bovid", "caprine", "dialogue", "fur", "goat", "human", "lizard", "mammal", "reptile", "scalie", "text", "white_body", "white_fur"], "selected_tags": ["anthro", "clothing", "duo", "group", "solo", "taur", "text"], "stage3_selected": [], "stage3_selected_scores": {}, "stage3_selected_ranks": {}, "stage3_selected_phrase_ranks": {}, "extra_evidence": {"anthro": {"source": "probe"}, "clothing": {"source": "probe"}, "duo": {"source": "structural"}, "group": {"source": "structural"}, "solo": {"source": "structural"}, "taur": {"source": "structural"}}, "structural": ["solo", "duo", "group", "taur", "text"], "probe": ["clothing", "anthro", "text", "group"], "t1": 2.11, "t2": 1.85, "t3": 80.12, "t3s": 2.32, "t3p": 5.81, 
"err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=110 entity=4 copyright_filtered=4 generic_char_to_general=0 unknown_type=1", "Stage3 general_chunk_0: attempt 1 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"weak_implied\"}, {\"i\": 2, \"why\": \"style_or_meta\"}, {\"i\": 3, \"why\": \"other\"}, {\"i\": 4, \"why\": \"explicit\"}, {\"i\": 5, \"why\": \"weak_implied\"}, {\"i\": 6, \"why\": \"weak_implied\"}, {\"i\": 7, \"why\": \"explicit\"}, {\"i\": 8, \"why\": \"weak_implied\"}, {\"i\": 9, \"why\": \"weak_implied\"}, {\"i\": 10, \"why\": \"explicit\"}, {\"i\": 11, \"why\": \"explicit\"}, {\"i\": 12, \"why\": \"weak_implied\"}, {\"i\": 13, \"why\": \"explicit\"}, {\"i\": 14, \"why\": \"weak_implied\"}, {\"i\": 15, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"weak_implied\"}, {\"i\": 17, \"why\": \"explicit\"}, {\"i\": 18, \"why\": \"explicit\"}, {\"i\": 19, \"why\": \"weak_implied\"}, {\"i\": 20, \"why\": \"style_or_meta\"}, {\"i\": 21, \"why\": \"other\"}, {\"i\": 22, \"why\": \"weak_implied\"}, {\"i\": 23, \"why\": \"weak_implied\"}, {\"i\": 24, \"why\": \"style_or_meta\"}, {\"i\": 25, \"why\": \"weak_implied\"}, {\"i\": 26, \"why\": \"other\"}, {\"i\": 27, \"why\": \"style_or_meta\"}, {\"i\": 28, \"why\": \"weak_implied\"}, {\"i\": 29, \"why\": \"weak_implied\"}, {\"i\": 30, \"why\": \"explicit\"}, {\"i\": 31, \"why\": \"weak_implied\"}, {\"i\": 32, \"why\": \"explicit\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 34, \"why\": \"weak_implied\"}, {\"i\": 35, \"why\": \"explicit\"}, {\"i\": 36, \"\": null}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.35.why\n Field required [type=missing, input_value={'i': 36, '': None}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_0: attempt 2 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"weak_implied\"}, {\"i\": 2, \"why\": \"style_or_meta\"}, {\"i\": 3, \"why\": \"other\"}, {\"i\": 4, \"why\": \"explicit\"}, {\"i\": 5, \"why\": \"weak_implied\"}, {\"i\": 6, \"why\": \"weak_implied\"}, {\"i\": 7, \"why\": \"weak_implied\"}, {\"i\": 8, \"why\": \"weak_implied\"}, {\"i\": 9, \"why\": \"weak_implied\"}, {\"i\": 10, \"why\": \"weak_implied\"}, {\"i\": 11, \"why\": \"explicit\"}, {\"i\": 12, \"why\": \"weak_implied\"}, {\"i\": 13, \"why\": \"weak_implied\"}, {\"i\": 14, \"why\": \"weak_implied\"}, {\"i\": 15, \"why\": \"weak_implied\"}, {\"i\": 16, \"why\": \"weak_implied\"}, {\"i\": 17, \"why\": \"weak_implied\"}, {\"i\": 18, \"why\": \"weak_implied\"}, {\"i\": 19, \"why\": \"style_or_meta\"}, {\"i\": 20, \"why\": \"weak_implied\"}, {\"i\": 21, \"why\": \"other\"}, {\"i\": 22, \"why\": \"weak_implied\"}, {\"i\": 23, \"why\": \"weak_implied\"}, {\"i\": 24, \"why\": \"weak_implied\"}, {\"i\": 25, \"why\": \"weak_implied\"}, {\"i\": 26, \"why\": \"other\"}, {\"i\": 27, \"why\": \"style_or_meta\"}, {\"i\": 28, \"why\": \"weak_implied\"}, {\"i\": 29, \"why\": \"weak_implied\"}, {\"i\": 30, \"why\": \"explicit\"}, {\"i\": 31, \"why\": \"weak_implied\"}, {\"i\": 32, \"why\": \"weak_implied\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 34, \"why\": \"weak_implied\"}, {\"i\": 35}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.34.why\n Field required [type=missing, input_value={'i': 35}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_0: attempt 3 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"weak_implied\"}, {\"i\": 2, \"why\": \"style_or_meta\"}, {\"i\": 3, \"why\": \"other\"}, {\"i\": 4, \"why\": \"explicit\"}, {\"i\": 5, \"why\": \"weak_implied\"}, {\"i\": 6, \"why\": \"weak_implied\"}, {\"i\": 7, \"why\": \"weak_implied\"}, {\"i\": 8, \"why\": \"weak_implied\"}, {\"i\": 9, \"why\": \"weak_implied\"}, {\"i\": 10, \"why\": \"weak_implied\"}, {\"i\": 11, \"why\": \"explicit\"}, {\"i\": 12, \"why\": \"weak_implied\"}, {\"i\": 13, \"why\": \"weak_implied\"}, {\"i\": 14, \"why\": \"weak_implied\"}, {\"i\": 15, \"why\": \"weak_implied\"}, {\"i\": 16, \"why\": \"weak_implied\"}, {\"i\": 17, \"why\": \"weak_implied\"}, {\"i\": 18, \"why\": \"weak_implied\"}, {\"i\": 19, \"why\": \"style_or_meta\"}, {\"i\": 20, \"why\": \"weak_implied\"}, {\"i\": 21, \"why\": \"other\"}, {\"i\": 22, \"why\": \"weak_implied\"}, {\"i\": 23, \"why\": \"weak_implied\"}, {\"i\": 24, \"why\": \"weak_implied\"}, {\"i\": 25, \"why\": \"weak_implied\"}, {\"i\": 26, \"why\": \"other\"}, {\"i\": 27, \"why\": \"style_or_meta\"}, {\"i\": 28, \"why\": \"weak_implied\"}, {\"i\": 29, \"why\": \"weak_implied\"}, {\"i\": 30, \"why\": \"explicit\"}, {\"i\": 31, \"why\": \"weak_implied\"}, {\"i\": 32, \"why\": \"weak_implied\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 34, \"why\": \"weak_implied\"}, {\"i\": 35}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.34.why\n Field required [type=missing, input_value={'i': 35}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_0: gave up after 3 attempts", "Stage3 general_chunk_1: attempt 1 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"weak_implied\"}, {\"i\": 2, \"why\": \"weak_implied\"}, {\"i\": 3, \"why\": \"weak_implied\"}, {\"i\": 4, \"why\": \"weak_implied\"}, {\"i\": 5, \"why\": \"weak_implied\"}, {\"i\": 6, \"why\": \"other\"}, {\"i\": 7, \"why\": \"other\"}, {\"i\": 8, \"why\": \"other\"}, {\"i\": 9, \"why\": \"explicit\"}, {\"i\": 10, \"why\": \"other\"}, {\"i\": 11, \"why\": \"explicit\"}, {\"i\": 12, \"why\": \"other\"}, {\"i\": 13, \"why\": \"other\"}, {\"i\": 14, \"why\": \"weak_implied\"}, {\"i\": 15, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"weak_implied\"}, {\"i\": 17, \"why\": \"other\"}, {\"i\": 18, \"why\": \"other\"}, {\"i\": 19, \"why\": \"weak_implied\"}, {\"i\": 20, \"why\": \"weak_implied\"}, {\"i\": 21, \"why\": \"weak_implied\"}, {\"i\": 22, \"why\": \"other\"}, {\"i\": 23, \"why\": \"weak_implied\"}, {\"i\": 24, \"why\": \"other\"}, {\"i\": 25, \"why\": \"other\"}, {\"i\": 26, \"why\": \"weak_implied\"}, {\"i\": 27, \"why\": \"other\"}, {\"i\": 28, \"why\": \"weak_implied\"}, {\"i\": 29, \"why\": \"other\"}, {\"i\": 30, \"why\": \"explicit\"}, {\"i\": 31, \"why\": \"weak_implied\"}, {\"i\": 32, \"why\": \"weak_implied\"}, {\"i\": 33, \"why\": \"explicit\"}, {\"i\": 34, \"why\": \"weak_implied\"}, {\"i\": 35, \"why\": \"other\"}, {\"i\": 36, \"why\": \"weak_implied\"}, {}]}. 
Got: 2 validation errors for Stage3SelectionResponse\nselections.36.i\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nselections.36.why\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_chunk_1: gave up after 3 attempts"]}
8
+ {"id": 335343, "n_gt": 15, "n_retrieved": 128, "n_selected": 28, "n_implied": 5, "n_structural": 3, "n_probe": 3, "ret_R": 0.6, "P": 0.3929, "R": 0.7333, "F1": 0.5116, "leaf_P": 0.25, "leaf_R": 0.4167, "leaf_F1": 0.3125, "n_leaf_sel": 20, "n_leaf_gt": 12, "ret_P": 0.0703, "sel_given_ret": 1.2222, "over_sel": 1.87, "why": {"explicit": 20}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 3, "calls_with_selection": 3, "calls_exhausted_retries": 0, "attempts_total": 3, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 3, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 12, "kept_total": 53, "attempts_by_n_local": {"60": {"attempts": 2, "parse_ok": 2, "parse_fail": 0, "errors": 0}, "11": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3929, "gen_R": 0.7333, "gen_F1": 0.5116, "missed": ["angry", "eyes_closed", "green_eyes", "sleeping"], "extra": ["annoyed_expression", "anthro", "bedroom", "clothing", "english_text", "expressions", "eyes", "humanoid", "lipstick", "lying_on_bed", "on_bed", "pajamas", "relaxed_expression", "resting", "sleepover", "watermark", "yellow_eyeshadow"], "ground_truth_tags": ["angry", "bed", "blonde_hair", "blue_eyes", "duo", "eyes_closed", "eyeshadow", "furniture", "green_eyes", "hair", "lying", "makeup", "purple_hair", "sleeping", "text"], "selected_tags": ["annoyed_expression", "anthro", "bed", "bedroom", "blonde_hair", "blue_eyes", "clothing", "duo", "english_text", "expressions", "eyes", "eyeshadow", "furniture", "hair", "humanoid", "lipstick", "lying", "lying_on_bed", "makeup", "on_bed", "pajamas", "purple_hair", "relaxed_expression", "resting", "sleepover", "text", "watermark", "yellow_eyeshadow"], "stage3_selected": ["annoyed_expression", "bedroom", "blonde_hair", 
"blue_eyes", "english_text", "expressions", "eyes", "eyeshadow", "hair", "lipstick", "lying_on_bed", "makeup", "pajamas", "purple_hair", "relaxed_expression", "resting", "sleepover", "text", "watermark", "yellow_eyeshadow"], "stage3_selected_scores": {"hair": 0.6031, "text": 0.6007, "blue_eyes": 0.6014, "blonde_hair": 0.5986, "purple_hair": 0.5642, "makeup": 0.5965, "eyeshadow": 0.4763, "watermark": 0.6042, "lipstick": 0.4874, "bedroom": 0.4901, "lying_on_bed": 0.4093, "pajamas": 0.3753, "resting": 0.5144, "annoyed_expression": 0.7251, "expressions": 0.5439, "sleepover": 0.5269, "yellow_eyeshadow": 0.4551, "relaxed_expression": 0.4534, "eyes": 0.8951, "english_text": 0.4189}, "stage3_selected_ranks": {"hair": 5, "text": 8, "blue_eyes": 7, "blonde_hair": 10, "purple_hair": 14, "makeup": 11, "eyeshadow": 46, "watermark": 4, "lipstick": 40, "bedroom": 38, "lying_on_bed": 92, "pajamas": 112, "resting": 28, "annoyed_expression": 2, "expressions": 18, "sleepover": 25, "yellow_eyeshadow": 54, "relaxed_expression": 55, "eyes": 1, "english_text": 83}, "stage3_selected_phrase_ranks": {"hair": 1, "text": 1, "blue_eyes": 1, "blonde_hair": 1, "purple_hair": 1, "makeup": 1, "eyeshadow": 3, "watermark": 1, "lipstick": 2, "bedroom": 1, "lying_on_bed": 4, "pajamas": 3, "resting": 1, "annoyed_expression": 1, "expressions": 3, "sleepover": 1, "yellow_eyeshadow": 6, "relaxed_expression": 6, "eyes": 1, "english_text": 4}, "extra_evidence": {"annoyed_expression": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7251}, "anthro": {"source": "probe"}, "bedroom": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4901}, "clothing": {"source": "implied"}, "english_text": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4189}, "expressions": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5439}, "eyes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8951}, "humanoid": {"source": "structural"}, "lipstick": {"source": "stage3", "why": 
"explicit", "retrieval_score": 0.4874}, "lying_on_bed": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4093}, "on_bed": {"source": "implied"}, "pajamas": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3753}, "relaxed_expression": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4534}, "resting": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5144}, "sleepover": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5269}, "watermark": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6042}, "yellow_eyeshadow": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4551}}, "structural": ["duo", "humanoid", "text"], "probe": ["simple_background", "anthro", "duo"], "t1": 2.43, "t2": 2.06, "t3": 16.44, "t3s": 1.29, "t3p": 5.82, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=131 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=4"]}
9
+ {"id": 17482, "n_gt": 22, "n_retrieved": 74, "n_selected": 36, "n_implied": 11, "n_structural": 3, "n_probe": 3, "ret_R": 0.2727, "P": 0.4167, "R": 0.6818, "F1": 0.5172, "leaf_P": 0.2222, "leaf_R": 0.3077, "leaf_F1": 0.2581, "n_leaf_sel": 18, "n_leaf_gt": 13, "ret_P": 0.0811, "sel_given_ret": 2.5, "over_sel": 1.64, "why": {"explicit": 21}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 41, "attempts_by_n_local": {"60": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}, "17": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.4167, "gen_R": 0.6818, "gen_F1": 0.5172, "missed": ["fingers", "fur", "holding_musical_instrument", "holding_object", "music", "spade_tail", "tail"], "extra": ["3_claws", "acoustic_guitar", "blonde_hair", "canis", "crosslegged_pose", "dire_wolf", "electric_guitar", "finger_claws", "flowing_hair", "holding_hair", "leggings", "legwear", "mexican_wolf", "pastel_background", "playing_guitar", "playing_music", "toe_claws", "torn_leggings", "torn_legwear", "touching_hair", "wolf"], "ground_truth_tags": ["anthro", "bass_guitar", "canid", "canine", "claws", "clothed", "clothing", "fingers", "fur", "guitar", "hair", "holding_musical_instrument", "holding_object", "mammal", "music", "musical_instrument", "plucked_string_instrument", "solo", "spade_tail", "string_instrument", "tail", "torn_clothing"], "selected_tags": ["3_claws", "acoustic_guitar", "anthro", "bass_guitar", "blonde_hair", "canid", "canine", "canis", "claws", "clothed", "clothing", "crosslegged_pose", "dire_wolf", 
"electric_guitar", "finger_claws", "flowing_hair", "guitar", "hair", "holding_hair", "leggings", "legwear", "mammal", "mexican_wolf", "musical_instrument", "pastel_background", "playing_guitar", "playing_music", "plucked_string_instrument", "solo", "string_instrument", "toe_claws", "torn_clothing", "torn_leggings", "torn_legwear", "touching_hair", "wolf"], "stage3_selected": ["3_claws", "acoustic_guitar", "bass_guitar", "blonde_hair", "claws", "crosslegged_pose", "dire_wolf", "electric_guitar", "finger_claws", "flowing_hair", "guitar", "holding_hair", "mexican_wolf", "pastel_background", "playing_guitar", "playing_music", "plucked_string_instrument", "toe_claws", "torn_clothing", "torn_leggings", "wolf"], "stage3_selected_scores": {"claws": 0.5504, "wolf": 0.5691, "blonde_hair": 0.3645, "toe_claws": 0.4749, "torn_clothing": 0.3951, "finger_claws": 0.422, "plucked_string_instrument": 0.8817, "guitar": 0.9788, "playing_music": 0.8891, "playing_guitar": 0.9494, "3_claws": 0.4218, "dire_wolf": 0.4342, "electric_guitar": 0.8829, "bass_guitar": 0.9286, "flowing_hair": 0.5466, "crosslegged_pose": 0.445, "mexican_wolf": 0.4285, "torn_leggings": 0.3987, "holding_hair": 0.3725, "acoustic_guitar": 0.8816, "pastel_background": 0.5453}, "stage3_selected_ranks": {"claws": 13, "wolf": 8, "blonde_hair": 70, "toe_claws": 30, "torn_clothing": 62, "finger_claws": 52, "plucked_string_instrument": 6, "guitar": 1, "playing_music": 4, "playing_guitar": 2, "3_claws": 53, "dire_wolf": 48, "electric_guitar": 5, "bass_guitar": 3, "flowing_hair": 15, "crosslegged_pose": 42, "mexican_wolf": 50, "torn_leggings": 60, "holding_hair": 67, "acoustic_guitar": 7, "pastel_background": 16}, "stage3_selected_phrase_ranks": {"claws": 1, "wolf": 1, "blonde_hair": 6, "toe_claws": 2, "torn_clothing": 6, "finger_claws": 5, "plucked_string_instrument": 5, "guitar": 1, "playing_music": 3, "playing_guitar": 1, "3_claws": 6, "dire_wolf": 5, "electric_guitar": 4, "bass_guitar": 2, "flowing_hair": 1, 
"crosslegged_pose": 6, "mexican_wolf": 6, "torn_leggings": 5, "holding_hair": 4, "acoustic_guitar": 5, "pastel_background": 1}, "extra_evidence": {"3_claws": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4218}, "acoustic_guitar": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8816}, "blonde_hair": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3645}, "canis": {"source": "implied"}, "crosslegged_pose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.445}, "dire_wolf": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4342}, "electric_guitar": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8829}, "finger_claws": {"source": "stage3", "why": "explicit", "retrieval_score": 0.422}, "flowing_hair": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5466}, "holding_hair": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3725}, "leggings": {"source": "implied"}, "legwear": {"source": "implied"}, "mexican_wolf": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4285}, "pastel_background": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5453}, "playing_guitar": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9494}, "playing_music": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8891}, "toe_claws": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4749}, "torn_leggings": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3987}, "torn_legwear": {"source": "implied"}, "touching_hair": {"source": "implied"}, "wolf": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5691}}, "structural": ["solo", "anthro", "clothed"], "probe": ["anthro", "canid", "solo"], "t1": 2.51, "t2": 1.43, "t3": 7.44, "t3s": 1.39, "t3p": 2.51, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=77 entity=0 copyright_filtered=1 
generic_char_to_general=0 unknown_type=2"]}
10
+ {"id": 2021552, "n_gt": 25, "n_retrieved": 91, "n_selected": 34, "n_implied": 17, "n_structural": 4, "n_probe": 3, "ret_R": 0.56, "P": 0.4706, "R": 0.64, "F1": 0.5424, "leaf_P": 0.3333, "leaf_R": 0.3333, "leaf_F1": 0.3333, "n_leaf_sel": 15, "n_leaf_gt": 15, "ret_P": 0.1538, "sel_given_ret": 1.1429, "over_sel": 1.36, "why": {"explicit": 11, "strong_implied": 2}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 56, "attempts_by_n_local": {"60": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}, "31": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.4706, "gen_R": 0.64, "gen_F1": 0.5424, "missed": ["claws", "crossed_arms", "facial_markings", "fur", "head_markings", "looking_at_another", "markings", "overalls", "standing"], "extra": ["black_bottomwear", "black_clothing", "black_pants", "blue_clothing", "blue_overalls", "blue_topwear", "cross_fox", "dress_shirt", "grey_clothing", "grey_shirt", "grey_topwear", "looking_at_viewer", "mouth_closed", "open_mouth", "red_fox", "white_clothing", "white_shirt", "white_topwear"], "ground_truth_tags": ["anthro", "bottomwear", "canid", "canine", "claws", "clothed", "clothing", "crossed_arms", "duo", "facial_markings", "fox", "fur", "grey_background", "head_markings", "lagomorph", "leporid", "looking_at_another", "mammal", "markings", "overalls", "pants", "rabbit", "shirt", "standing", "topwear"], "selected_tags": ["anthro", "black_bottomwear", "black_clothing", "black_pants", "blue_clothing", "blue_overalls", "blue_topwear", "bottomwear", "canid", "canine", 
"clothed", "clothing", "cross_fox", "dress_shirt", "duo", "fox", "grey_background", "grey_clothing", "grey_shirt", "grey_topwear", "lagomorph", "leporid", "looking_at_viewer", "mammal", "mouth_closed", "open_mouth", "pants", "rabbit", "red_fox", "shirt", "topwear", "white_clothing", "white_shirt", "white_topwear"], "stage3_selected": ["black_bottomwear", "black_pants", "blue_overalls", "blue_topwear", "cross_fox", "dress_shirt", "fox", "grey_background", "grey_shirt", "mouth_closed", "open_mouth", "rabbit", "white_shirt"], "stage3_selected_scores": {"open_mouth": 0.633, "fox": 0.638, "rabbit": 0.6511, "grey_background": 0.6784, "black_bottomwear": 0.7384, "blue_topwear": 0.666, "white_shirt": 0.8197, "dress_shirt": 0.6688, "black_pants": 0.833, "grey_shirt": 0.6923, "blue_overalls": 0.9203, "mouth_closed": 0.5678, "cross_fox": 0.4688}, "stage3_selected_ranks": {"open_mouth": 34, "fox": 32, "rabbit": 31, "grey_background": 24, "black_bottomwear": 15, "blue_topwear": 27, "white_shirt": 4, "dress_shirt": 26, "black_pants": 3, "grey_shirt": 21, "blue_overalls": 1, "mouth_closed": 58, "cross_fox": 87}, "stage3_selected_phrase_ranks": {"open_mouth": 1, "fox": 1, "rabbit": 1, "grey_background": 1, "black_bottomwear": 5, "blue_topwear": 4, "white_shirt": 1, "dress_shirt": 5, "black_pants": 1, "grey_shirt": 4, "blue_overalls": 1, "mouth_closed": 1, "cross_fox": 4}, "extra_evidence": {"black_bottomwear": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7384}, "black_clothing": {"source": "implied"}, "black_pants": {"source": "stage3", "why": "explicit", "retrieval_score": 0.833}, "blue_clothing": {"source": "implied"}, "blue_overalls": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9203}, "blue_topwear": {"source": "stage3", "why": "explicit", "retrieval_score": 0.666}, "cross_fox": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4688}, "dress_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6688}, 
"grey_clothing": {"source": "implied"}, "grey_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6923}, "grey_topwear": {"source": "implied"}, "looking_at_viewer": {"source": "structural"}, "mouth_closed": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.5678}, "open_mouth": {"source": "stage3", "why": "explicit", "retrieval_score": 0.633}, "red_fox": {"source": "implied"}, "white_clothing": {"source": "implied"}, "white_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8197}, "white_topwear": {"source": "implied"}}, "structural": ["duo", "anthro", "clothed", "looking_at_viewer"], "probe": ["simple_background", "anthro", "duo"], "t1": 1.99, "t2": 1.49, "t3": 17.7, "t3s": 1.15, "t3p": 1.25, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=91 entity=2 copyright_filtered=0 generic_char_to_general=0 unknown_type=3"]}
11
+ {"id": 2034167, "n_gt": 11, "n_retrieved": 127, "n_selected": 58, "n_implied": 18, "n_structural": 5, "n_probe": 3, "ret_R": 0.5455, "P": 0.1379, "R": 0.7273, "F1": 0.2319, "leaf_P": 0.0541, "leaf_R": 0.2857, "leaf_F1": 0.0909, "n_leaf_sel": 37, "n_leaf_gt": 7, "ret_P": 0.0472, "sel_given_ret": 1.3333, "over_sel": 5.27, "why": {"explicit": 31, "strong_implied": 4}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 3, "calls_with_selection": 3, "calls_exhausted_retries": 0, "attempts_total": 3, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 3, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 61, "attempts_by_n_local": {"60": {"attempts": 2, "parse_ok": 2, "parse_fail": 0, "errors": 0}, "12": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.1379, "gen_R": 0.7273, "gen_F1": 0.2319, "missed": ["blue_eyes", "blue_nose", "purple_body"], "extra": ["actual_fur", "angry_expression", "animal_humanoid", "anthro", "belly", "big_eyes", "blue_fingers", "blue_inner_ear_fluff", "blue_toes", "canid_humanoid", "canine_humanoid", "clothed", "clothing", "curved_tail", "cute_expression", "expressions", "facial_stripes", "fighting_pose", "fluffy_fur", "fox_humanoid", "half_body", "humanoid", "inner_ear_fluff", "jumper", "jumping", "light_tail", "looking_at_viewer", "male", "mammal_humanoid", "mouth_closed", "multi_tone_fur", "open_smile", "pink_ears", "pink_legs", "pink_stripes", "pink_tongue", "pose", "purple_belly", "purple_face", "scales", "scaly_tail", "smile", "stripes", "tail", "tail_tuft", "tan_nose", "teal_nose", "tongue", "tuft", "wolf_humanoid"], "ground_truth_tags": ["blue_eyes", "blue_nose", "canid", "canine", "fur", "mammal", "open_mouth", "purple_body", "solo", "white_body", 
"white_fur"], "selected_tags": ["actual_fur", "angry_expression", "animal_humanoid", "anthro", "belly", "big_eyes", "blue_fingers", "blue_inner_ear_fluff", "blue_toes", "canid", "canid_humanoid", "canine", "canine_humanoid", "clothed", "clothing", "curved_tail", "cute_expression", "expressions", "facial_stripes", "fighting_pose", "fluffy_fur", "fox_humanoid", "fur", "half_body", "humanoid", "inner_ear_fluff", "jumper", "jumping", "light_tail", "looking_at_viewer", "male", "mammal", "mammal_humanoid", "mouth_closed", "multi_tone_fur", "open_mouth", "open_smile", "pink_ears", "pink_legs", "pink_stripes", "pink_tongue", "pose", "purple_belly", "purple_face", "scales", "scaly_tail", "smile", "solo", "stripes", "tail", "tail_tuft", "tan_nose", "teal_nose", "tongue", "tuft", "white_body", "white_fur", "wolf_humanoid"], "stage3_selected": ["actual_fur", "angry_expression", "big_eyes", "blue_fingers", "blue_inner_ear_fluff", "blue_toes", "canine_humanoid", "curved_tail", "cute_expression", "expressions", "facial_stripes", "fighting_pose", "fluffy_fur", "fox_humanoid", "half_body", "jumper", "jumping", "light_tail", "mouth_closed", "multi_tone_fur", "open_mouth", "open_smile", "pink_ears", "pink_legs", "pink_stripes", "pink_tongue", "purple_belly", "purple_face", "scaly_tail", "simple_background", "tail_tuft", "tan_nose", "teal_nose", "white_fur", "wolf_humanoid"], "stage3_selected_scores": {"simple_background": 0.5948, "white_fur": 0.5995, "tail_tuft": 0.4995, "pink_tongue": 0.4215, "canine_humanoid": 0.9003, "fox_humanoid": 0.8204, "big_eyes": 0.4207, "mouth_closed": 0.5218, "wolf_humanoid": 0.819, "cute_expression": 0.4486, "pink_ears": 0.5255, "fighting_pose": 0.4594, "multi_tone_fur": 0.5135, "tan_nose": 0.473, "expressions": 0.4957, "light_tail": 0.5671, "pink_stripes": 0.682, "blue_inner_ear_fluff": 0.4727, "purple_face": 0.5577, "fluffy_fur": 0.5593, "angry_expression": 0.4879, "purple_belly": 0.5454, "scaly_tail": 0.4822, "blue_fingers": 0.5077, "blue_toes": 
0.5148, "facial_stripes": 0.5968, "half_body": 0.4115, "teal_nose": 0.4695, "jumper": 0.4077, "pink_legs": 0.5285, "actual_fur": 0.4563, "open_mouth": 0.6008, "open_smile": 0.4868, "jumping": 0.6014, "curved_tail": 0.637}, "stage3_selected_ranks": {"simple_background": 31, "white_fur": 26, "tail_tuft": 77, "pink_tongue": 111, "canine_humanoid": 1, "fox_humanoid": 3, "big_eyes": 112, "mouth_closed": 66, "wolf_humanoid": 4, "cute_expression": 106, "pink_ears": 62, "fighting_pose": 101, "multi_tone_fur": 70, "tan_nose": 97, "expressions": 79, "light_tail": 41, "pink_stripes": 9, "blue_inner_ear_fluff": 98, "purple_face": 48, "fluffy_fur": 46, "angry_expression": 86, "purple_belly": 50, "scaly_tail": 89, "blue_fingers": 73, "blue_toes": 68, "facial_stripes": 29, "half_body": 115, "teal_nose": 99, "jumper": 116, "pink_legs": 60, "actual_fur": 104, "open_mouth": 24, "open_smile": 87, "jumping": 23, "curved_tail": 12}, "stage3_selected_phrase_ranks": {"simple_background": 1, "white_fur": 1, "tail_tuft": 4, "pink_tongue": 3, "canine_humanoid": 1, "fox_humanoid": 3, "big_eyes": 6, "mouth_closed": 3, "wolf_humanoid": 4, "cute_expression": 6, "pink_ears": 4, "fighting_pose": 5, "multi_tone_fur": 5, "tan_nose": 4, "expressions": 3, "light_tail": 4, "pink_stripes": 1, "blue_inner_ear_fluff": 4, "purple_face": 3, "fluffy_fur": 2, "angry_expression": 4, "purple_belly": 4, "scaly_tail": 5, "blue_fingers": 5, "blue_toes": 4, "facial_stripes": 3, "half_body": 4, "teal_nose": 5, "jumper": 3, "pink_legs": 3, "actual_fur": 3, "open_mouth": 1, "open_smile": 2, "jumping": 1, "curved_tail": 1}, "extra_evidence": {"actual_fur": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4563}, "angry_expression": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4879}, "animal_humanoid": {"source": "implied"}, "anthro": {"source": "structural"}, "belly": {"source": "implied"}, "big_eyes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4207}, "blue_fingers": 
{"source": "stage3", "why": "explicit", "retrieval_score": 0.5077}, "blue_inner_ear_fluff": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4727}, "blue_toes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5148}, "canid_humanoid": {"source": "implied"}, "canine_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9003}, "clothed": {"source": "structural"}, "clothing": {"source": "implied"}, "curved_tail": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.637}, "cute_expression": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4486}, "expressions": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4957}, "facial_stripes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5968}, "fighting_pose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4594}, "fluffy_fur": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5593}, "fox_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8204}, "half_body": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4115}, "humanoid": {"source": "implied"}, "inner_ear_fluff": {"source": "implied"}, "jumper": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4077}, "jumping": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.6014}, "light_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5671}, "looking_at_viewer": {"source": "structural"}, "male": {"source": "structural"}, "mammal_humanoid": {"source": "implied"}, "mouth_closed": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5218}, "multi_tone_fur": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5135}, "open_smile": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4868}, "pink_ears": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5255}, "pink_legs": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5285}, "pink_stripes": 
{"source": "stage3", "why": "explicit", "retrieval_score": 0.682}, "pink_tongue": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4215}, "pose": {"source": "implied"}, "purple_belly": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5454}, "purple_face": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5577}, "scales": {"source": "implied"}, "scaly_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4822}, "smile": {"source": "implied"}, "stripes": {"source": "implied"}, "tail": {"source": "implied"}, "tail_tuft": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4995}, "tan_nose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.473}, "teal_nose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4695}, "tongue": {"source": "implied"}, "tuft": {"source": "implied"}, "wolf_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.819}}, "structural": ["solo", "anthro", "male", "clothed", "looking_at_viewer"], "probe": ["anthro", "canid", "solo"], "t1": 4.15, "t2": 2.03, "t3": 36.05, "t3s": 1.41, "t3p": 5.23, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=132 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=4"]}
data/eval_results/latency_k1_seed42.jsonl ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"_meta": true, "timestamp": "2026-03-02T08:44:10.811021", "n_samples": 10, "caption_field": "caption_cogvlm", "skip_rewrite": false, "allow_nsfw": false, "mode": "chunked_map_union", "chunk_size": 60, "eval_path": "data/eval_samples/e621_sfw_sample_1000_seed123_buffer10000_caption_evident.jsonl", "per_phrase_k": 2, "per_phrase_final_k": 1, "temperature": 0.0, "shuffle": false, "seed": 42, "workers": 4, "min_why": "strong_implied", "expand_implications": true, "infer_structural": true, "infer_probe": true, "n_errors": 0, "n_issue_samples": 10, "n_issues_total": 20}
2
+ {"id": 3285630, "n_gt": 12, "n_retrieved": 16, "n_selected": 25, "n_implied": 7, "n_structural": 4, "n_probe": 5, "ret_R": 0.1667, "P": 0.4, "R": 0.8333, "F1": 0.5405, "leaf_P": 0.3571, "leaf_R": 0.5556, "leaf_F1": 0.4348, "n_leaf_sel": 14, "n_leaf_gt": 9, "ret_P": 0.125, "sel_given_ret": 5.0, "over_sel": 2.08, "why": {"explicit": 11, "strong_implied": 2}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 15, "attempts_by_n_local": {"19": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.4, "gen_R": 0.8333, "gen_F1": 0.5405, "missed": ["alpha_channel", "fingers"], "extra": ["black_body", "black_fur", "business_attire", "formal", "hair_bun", "holding_mug", "holding_object", "mug", "necktie", "teal_shirt", "teal_topwear", "text", "topwear", "vest", "white_necktie"], "ground_truth_tags": ["alpha_channel", "anthro", "clothed", "clothing", "felid", "feline", "fingers", "fur", "hair", "male", "mammal", "solo"], "selected_tags": ["anthro", "black_body", "black_fur", "business_attire", "clothed", "clothing", "felid", "feline", "formal", "fur", "hair", "hair_bun", "holding_mug", "holding_object", "male", "mammal", "mug", "necktie", "solo", "teal_shirt", "teal_topwear", "text", "topwear", "vest", "white_necktie"], "stage3_selected": ["black_fur", "business_attire", "feline", "formal", "fur", "hair_bun", "holding_mug", "invalid_background", "necktie", "simple_background", "teal_shirt", "vest", "white_necktie"], "stage3_selected_scores": {"fur": 0.7146, "simple_background": 0.6978, "black_fur": 0.7183, "necktie": 
0.7314, "vest": 0.8403, "hair_bun": 0.6926, "holding_mug": 0.916, "formal": 0.5993, "business_attire": 0.5558, "teal_shirt": 0.7474, "white_necktie": 0.6418, "feline": 0.7062, "invalid_background": 0.6495}, "stage3_selected_ranks": {"fur": 9, "simple_background": 11, "black_fur": 8, "necktie": 7, "vest": 3, "hair_bun": 12, "holding_mug": 1, "formal": 16, "business_attire": 18, "teal_shirt": 6, "white_necktie": 14, "feline": 10, "invalid_background": 13}, "stage3_selected_phrase_ranks": {"fur": 1, "simple_background": 1, "black_fur": 1, "necktie": 1, "vest": 1, "hair_bun": 1, "holding_mug": 1, "formal": 1, "business_attire": 1, "teal_shirt": 1, "white_necktie": 1, "feline": 1, "invalid_background": 1}, "extra_evidence": {"black_body": {"source": "implied"}, "black_fur": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7183}, "business_attire": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5558}, "formal": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5993}, "hair_bun": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6926}, "holding_mug": {"source": "stage3", "why": "explicit", "retrieval_score": 0.916}, "holding_object": {"source": "implied"}, "mug": {"source": "implied"}, "necktie": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7314}, "teal_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7474}, "teal_topwear": {"source": "implied"}, "text": {"source": "probe"}, "topwear": {"source": "implied"}, "vest": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8403}, "white_necktie": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6418}}, "structural": ["solo", "anthro", "male", "clothed"], "probe": ["clothing", "anthro", "text", "felid", "solo"], "t1": 4.11, "t2": 3.46, "t3": 3.15, "t3s": 5.64, "t3p": 8.43, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=19 
entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
3
+ {"id": 260449, "n_gt": 14, "n_retrieved": 18, "n_selected": 24, "n_implied": 5, "n_structural": 7, "n_probe": 5, "ret_R": 0.5, "P": 0.4583, "R": 0.7857, "F1": 0.5789, "leaf_P": 0.3125, "leaf_R": 0.5, "leaf_F1": 0.3846, "n_leaf_sel": 16, "n_leaf_gt": 10, "ret_P": 0.3889, "sel_given_ret": 1.5714, "over_sel": 1.71, "why": {"explicit": 13}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 17, "attempts_by_n_local": {"21": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.4583, "gen_R": 0.7857, "gen_F1": 0.5789, "missed": ["fur", "hair", "human"], "extra": ["anthro", "bottomwear", "cheeky", "duo", "feral", "grin", "laugh", "loincloth", "raised_arms", "smile", "topless", "trio", "wide_grin"], "ground_truth_tags": ["ape", "bear", "clothed", "clothing", "dancing", "fur", "group", "hair", "haplorhine", "human", "looking_at_viewer", "male", "mammal", "primate"], "selected_tags": ["anthro", "ape", "bear", "bottomwear", "cheeky", "clothed", "clothing", "dancing", "duo", "feral", "grin", "group", "haplorhine", "laugh", "loincloth", "looking_at_viewer", "male", "mammal", "primate", "raised_arms", "smile", "topless", "trio", "wide_grin"], "stage3_selected": ["ape", "bear", "cheeky", "dancing", "grin", "laugh", "loincloth", "looking_at_viewer", "male", "primate", "raised_arms", "simple_background", "wide_grin"], "stage3_selected_scores": {"male": 0.5604, "simple_background": 0.5491, "looking_at_viewer": 0.5475, "bear": 0.5735, "grin": 0.5653, "primate": 0.8905, "loincloth": 0.5685, "dancing": 0.5568, "laugh": 0.5259, 
"ape": 0.9767, "raised_arms": 0.5445, "cheeky": 0.3903, "wide_grin": 0.5267}, "stage3_selected_ranks": {"male": 6, "simple_background": 8, "looking_at_viewer": 9, "bear": 3, "grin": 5, "primate": 2, "loincloth": 4, "dancing": 7, "laugh": 13, "ape": 1, "raised_arms": 10, "cheeky": 20, "wide_grin": 12}, "stage3_selected_phrase_ranks": {"male": 1, "simple_background": 1, "looking_at_viewer": 1, "bear": 1, "grin": 1, "primate": 1, "loincloth": 1, "dancing": 1, "laugh": 1, "ape": 1, "raised_arms": 1, "cheeky": 1, "wide_grin": 1}, "extra_evidence": {"anthro": {"source": "structural"}, "bottomwear": {"source": "implied"}, "cheeky": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3903}, "duo": {"source": "probe"}, "feral": {"source": "structural"}, "grin": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5653}, "laugh": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5259}, "loincloth": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5685}, "raised_arms": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5445}, "smile": {"source": "implied"}, "topless": {"source": "structural"}, "trio": {"source": "structural"}, "wide_grin": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5267}}, "structural": ["trio", "anthro", "feral", "male", "clothed", "topless", "looking_at_viewer"], "probe": ["anthro", "duo", "group", "bear", "simple_background"], "t1": 3.2, "t2": 4.2, "t3": 9.91, "t3s": 4.45, "t3p": 5.29, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=21 entity=0 copyright_filtered=0 generic_char_to_general=1 unknown_type=2"]}
4
+ {"id": 1078019, "n_gt": 14, "n_retrieved": 15, "n_selected": 23, "n_implied": 5, "n_structural": 4, "n_probe": 5, "ret_R": 0.3571, "P": 0.6087, "R": 1.0, "F1": 0.7568, "leaf_P": 0.5294, "leaf_R": 1.0, "leaf_F1": 0.6923, "n_leaf_sel": 17, "n_leaf_gt": 9, "ret_P": 0.3333, "sel_given_ret": 2.8, "over_sel": 1.64, "why": {"explicit": 5, "strong_implied": 6}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 13, "attempts_by_n_local": {"16": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.6087, "gen_R": 1.0, "gen_F1": 0.7568, "missed": [], "extra": ["<3", "coat", "eyes", "intimate", "looking_at_viewer", "relationship", "round_eyes", "setting", "topwear"], "ground_truth_tags": ["anthro", "blue_eyes", "blush", "clothed", "clothing", "duo", "lagomorph", "leporid", "mammal", "plushie", "rabbit", "romantic", "romantic_couple", "teal_eyes"], "selected_tags": ["<3", "anthro", "blue_eyes", "blush", "clothed", "clothing", "coat", "duo", "eyes", "intimate", "lagomorph", "leporid", "looking_at_viewer", "mammal", "plushie", "rabbit", "relationship", "romantic", "romantic_couple", "round_eyes", "setting", "teal_eyes", "topwear"], "stage3_selected": ["blue_eyes", "coat", "eyes", "intimate", "plushie", "rabbit", "relationship", "romantic_couple", "round_eyes", "setting", "teal_eyes"], "stage3_selected_scores": {"blue_eyes": 0.6105, "coat": 0.6317, "plushie": 0.6568, "teal_eyes": 0.6345, "relationship": 0.6088, "rabbit": 0.5844, "romantic_couple": 0.5619, "intimate": 0.4706, "round_eyes": 0.4982, "setting": 0.5515, "eyes": 
0.913}, "stage3_selected_ranks": {"blue_eyes": 7, "coat": 5, "plushie": 3, "teal_eyes": 4, "relationship": 8, "rabbit": 9, "romantic_couple": 11, "intimate": 16, "round_eyes": 15, "setting": 12, "eyes": 1}, "stage3_selected_phrase_ranks": {"blue_eyes": 1, "coat": 1, "plushie": 1, "teal_eyes": 1, "relationship": 1, "rabbit": 1, "romantic_couple": 1, "intimate": 1, "round_eyes": 1, "setting": 1, "eyes": 1}, "extra_evidence": {"<3": {"source": "probe"}, "coat": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6317}, "eyes": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.913}, "intimate": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4706}, "looking_at_viewer": {"source": "structural"}, "relationship": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6088}, "round_eyes": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4982}, "setting": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.5515}, "topwear": {"source": "implied"}}, "structural": ["duo", "anthro", "clothed", "looking_at_viewer"], "probe": ["clothing", "anthro", "blush", "duo", "<3"], "t1": 3.09, "t2": 4.27, "t3": 2.74, "t3s": 4.59, "t3p": 7.55, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=16 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
5
+ {"id": 1624724, "n_gt": 4, "n_retrieved": 12, "n_selected": 13, "n_implied": 1, "n_structural": 3, "n_probe": 3, "ret_R": 0.75, "P": 0.3077, "R": 1.0, "F1": 0.4706, "leaf_P": 0.3333, "leaf_R": 1.0, "leaf_F1": 0.5, "n_leaf_sel": 12, "n_leaf_gt": 4, "ret_P": 0.25, "sel_given_ret": 1.3333, "over_sel": 3.25, "why": {"explicit": 11}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 13, "attempts_by_n_local": {"15": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3077, "gen_R": 1.0, "gen_F1": 0.4706, "missed": [], "extra": ["anthro", "clothed", "clothing", "eyes", "floating", "nose", "spots", "toony", "unknown_species"], "ground_truth_tags": ["red_nose", "smile", "solo", "tan_body"], "selected_tags": ["anthro", "clothed", "clothing", "eyes", "floating", "nose", "red_nose", "smile", "solo", "spots", "tan_body", "toony", "unknown_species"], "stage3_selected": ["eyes", "floating", "invalid_tag", "nose", "red_nose", "smile", "spots", "tan_body", "toony", "unknown_species", "white_background"], "stage3_selected_scores": {"smile": 0.5956, "white_background": 0.6072, "tan_body": 0.6582, "spots": 0.6224, "toony": 0.5172, "unknown_species": 0.5802, "red_nose": 0.7475, "floating": 0.6454, "invalid_tag": 0.5285, "nose": 0.8611, "eyes": 0.9242}, "stage3_selected_ranks": {"smile": 10, "white_background": 9, "tan_body": 5, "spots": 8, "toony": 14, "unknown_species": 11, "red_nose": 3, "floating": 6, "invalid_tag": 13, "nose": 2, "eyes": 1}, "stage3_selected_phrase_ranks": {"smile": 1, "white_background": 1, "tan_body": 
1, "spots": 1, "toony": 1, "unknown_species": 1, "red_nose": 1, "floating": 1, "invalid_tag": 1, "nose": 1, "eyes": 1}, "extra_evidence": {"anthro": {"source": "structural"}, "clothed": {"source": "structural"}, "clothing": {"source": "implied"}, "eyes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9242}, "floating": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6454}, "nose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8611}, "spots": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6224}, "toony": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5172}, "unknown_species": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5802}}, "structural": ["solo", "anthro", "clothed"], "probe": ["simple_background", "anthro", "solo"], "t1": 2.91, "t2": 4.8, "t3": 6.45, "t3s": 4.59, "t3p": 5.4, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=15 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=4"]}
6
+ {"id": 1325009, "n_gt": 22, "n_retrieved": 18, "n_selected": 22, "n_implied": 6, "n_structural": 5, "n_probe": 3, "ret_R": 0.1818, "P": 0.6818, "R": 0.6818, "F1": 0.6818, "leaf_P": 0.3846, "leaf_R": 0.4167, "leaf_F1": 0.4, "n_leaf_sel": 13, "n_leaf_gt": 12, "ret_P": 0.2222, "sel_given_ret": 3.75, "over_sel": 1.0, "why": {"explicit": 10}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 15, "attempts_by_n_local": {"18": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.6818, "gen_R": 0.6818, "gen_F1": 0.6818, "missed": ["chest_tuft", "countershading", "hand_on_head", "muscular", "muscular_anthro", "muscular_male", "tuft"], "extra": ["countershade_body", "playful", "pose", "raised_hand", "striped_body", "striped_fur", "white_chest"], "ground_truth_tags": ["anthro", "blue_eyes", "bottomwear", "chest_tuft", "clothed", "clothing", "countershading", "felid", "fur", "hand_on_head", "male", "mammal", "muscular", "muscular_anthro", "muscular_male", "pantherine", "shorts", "solo", "stripes", "tiger", "topless", "tuft"], "selected_tags": ["anthro", "blue_eyes", "bottomwear", "clothed", "clothing", "countershade_body", "felid", "fur", "male", "mammal", "pantherine", "playful", "pose", "raised_hand", "shorts", "solo", "striped_body", "striped_fur", "stripes", "tiger", "topless", "white_chest"], "stage3_selected": ["blue_eyes", "countershade_body", "fur", "playful", "pose", "raised_hand", "shorts", "striped_fur", "tiger", "white_chest"], "stage3_selected_scores": {"fur": 0.5959, "blue_eyes": 0.5842, "pose": 0.6367, 
"shorts": 0.5939, "tiger": 0.6053, "striped_fur": 0.655, "raised_hand": 0.7024, "playful": 0.4435, "white_chest": 0.9243, "countershade_body": 0.8719}, "stage3_selected_ranks": {"fur": 12, "blue_eyes": 14, "pose": 10, "shorts": 13, "tiger": 11, "striped_fur": 9, "raised_hand": 6, "playful": 19, "white_chest": 2, "countershade_body": 3}, "stage3_selected_phrase_ranks": {"fur": 1, "blue_eyes": 1, "pose": 1, "shorts": 1, "tiger": 1, "striped_fur": 1, "raised_hand": 1, "playful": 1, "white_chest": 1, "countershade_body": 1}, "extra_evidence": {"countershade_body": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8719}, "playful": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4435}, "pose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6367}, "raised_hand": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7024}, "striped_body": {"source": "implied"}, "striped_fur": {"source": "stage3", "why": "explicit", "retrieval_score": 0.655}, "white_chest": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9243}}, "structural": ["solo", "anthro", "male", "clothed", "topless"], "probe": ["anthro", "felid", "solo"], "t1": 5.38, "t2": 1.5, "t3": 1.51, "t3s": 0.61, "t3p": 6.85, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=18 entity=0 copyright_filtered=1 generic_char_to_general=0 unknown_type=2"]}
7
+ {"id": 1023509, "n_gt": 13, "n_retrieved": 19, "n_selected": 9, "n_implied": 0, "n_structural": 5, "n_probe": 4, "ret_R": 0.2308, "P": 0.1111, "R": 0.0769, "F1": 0.0909, "leaf_P": 0.125, "leaf_R": 0.1667, "leaf_F1": 0.1429, "n_leaf_sel": 8, "n_leaf_gt": 6, "ret_P": 0.1579, "sel_given_ret": 0.3333, "over_sel": 0.69, "why": {"explicit": 2}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 8, "attempts_by_n_local": {"19": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.1111, "gen_R": 0.0769, "gen_F1": 0.0909, "missed": ["bovid", "caprine", "dialogue", "fur", "goat", "human", "lizard", "mammal", "reptile", "scalie", "white_body", "white_fur"], "extra": ["anthro", "clothed", "clothing", "group", "intersex", "light", "speech_bubble", "taur"], "ground_truth_tags": ["bovid", "caprine", "dialogue", "fur", "goat", "human", "lizard", "mammal", "reptile", "scalie", "text", "white_body", "white_fur"], "selected_tags": ["anthro", "clothed", "clothing", "group", "intersex", "light", "speech_bubble", "taur", "text"], "stage3_selected": ["light", "speech_bubble"], "stage3_selected_scores": {"speech_bubble": 0.5783, "light": 0.5852}, "stage3_selected_ranks": {"speech_bubble": 9, "light": 6}, "stage3_selected_phrase_ranks": {"speech_bubble": 1, "light": 1}, "extra_evidence": {"anthro": {"source": "probe"}, "clothed": {"source": "structural"}, "clothing": {"source": "probe"}, "group": {"source": "structural"}, "intersex": {"source": "structural"}, "light": {"source": "stage3", "why": "explicit", "retrieval_score": 
0.5852}, "speech_bubble": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5783}, "taur": {"source": "structural"}}, "structural": ["group", "taur", "intersex", "clothed", "text"], "probe": ["clothing", "anthro", "text", "group"], "t1": 3.11, "t2": 1.63, "t3": 1.05, "t3s": 1.38, "t3p": 4.03, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=19 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0"]}
8
+ {"id": 335343, "n_gt": 15, "n_retrieved": 21, "n_selected": 16, "n_implied": 5, "n_structural": 3, "n_probe": 3, "ret_R": 0.5333, "P": 0.6875, "R": 0.7333, "F1": 0.7097, "leaf_P": 0.5455, "leaf_R": 0.5, "leaf_F1": 0.5217, "n_leaf_sel": 11, "n_leaf_gt": 12, "ret_P": 0.381, "sel_given_ret": 1.375, "over_sel": 1.07, "why": {"explicit": 8}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 15, "attempts_by_n_local": {"22": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.6875, "gen_R": 0.7333, "gen_F1": 0.7097, "missed": ["angry", "eyes_closed", "eyeshadow", "sleeping"], "extra": ["annoyed_expression", "anthro", "humanoid", "lying_on_bed", "on_bed"], "ground_truth_tags": ["angry", "bed", "blonde_hair", "blue_eyes", "duo", "eyes_closed", "eyeshadow", "furniture", "green_eyes", "hair", "lying", "makeup", "purple_hair", "sleeping", "text"], "selected_tags": ["annoyed_expression", "anthro", "bed", "blonde_hair", "blue_eyes", "duo", "furniture", "green_eyes", "hair", "humanoid", "lying", "lying_on_bed", "makeup", "on_bed", "purple_hair", "text"], "stage3_selected": ["annoyed_expression", "blonde_hair", "blue_eyes", "green_eyes", "lying_on_bed", "makeup", "purple_hair", "text"], "stage3_selected_scores": {"text": 0.6007, "blue_eyes": 0.6014, "green_eyes": 0.5989, "blonde_hair": 0.5986, "purple_hair": 0.5642, "makeup": 0.5965, "lying_on_bed": 0.4241, "annoyed_expression": 0.7251}, "stage3_selected_ranks": {"text": 8, "blue_eyes": 7, "green_eyes": 9, "blonde_hair": 10, "purple_hair": 13, "makeup": 11, 
"lying_on_bed": 21, "annoyed_expression": 2}, "stage3_selected_phrase_ranks": {"text": 1, "blue_eyes": 1, "green_eyes": 1, "blonde_hair": 1, "purple_hair": 1, "makeup": 1, "lying_on_bed": 1, "annoyed_expression": 1}, "extra_evidence": {"annoyed_expression": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7251}, "anthro": {"source": "probe"}, "humanoid": {"source": "structural"}, "lying_on_bed": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4241}, "on_bed": {"source": "implied"}}, "structural": ["duo", "humanoid", "text"], "probe": ["simple_background", "anthro", "duo"], "t1": 3.99, "t2": 1.96, "t3": 4.0, "t3s": 2.23, "t3p": 1.34, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=22 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
9
+ {"id": 17482, "n_gt": 22, "n_retrieved": 16, "n_selected": 17, "n_implied": 6, "n_structural": 3, "n_probe": 3, "ret_R": 0.1818, "P": 0.6471, "R": 0.5, "F1": 0.5641, "leaf_P": 0.5, "leaf_R": 0.3846, "leaf_F1": 0.4348, "n_leaf_sel": 10, "n_leaf_gt": 13, "ret_P": 0.25, "sel_given_ret": 2.75, "over_sel": 0.77, "why": {"explicit": 7}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 11, "attempts_by_n_local": {"18": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.6471, "gen_R": 0.5, "gen_F1": 0.5641, "missed": ["bass_guitar", "fingers", "fur", "guitar", "holding_musical_instrument", "holding_object", "music", "musical_instrument", "plucked_string_instrument", "string_instrument", "torn_clothing"], "extra": ["bass_(disambiguation)", "canis", "flowing_hair", "pastel_background", "playing_bass", "wolf"], "ground_truth_tags": ["anthro", "bass_guitar", "canid", "canine", "claws", "clothed", "clothing", "fingers", "fur", "guitar", "hair", "holding_musical_instrument", "holding_object", "mammal", "music", "musical_instrument", "plucked_string_instrument", "solo", "spade_tail", "string_instrument", "tail", "torn_clothing"], "selected_tags": ["anthro", "bass_(disambiguation)", "canid", "canine", "canis", "claws", "clothed", "clothing", "flowing_hair", "hair", "mammal", "pastel_background", "playing_bass", "solo", "spade_tail", "tail", "wolf"], "stage3_selected": ["bass_(disambiguation)", "claws", "flowing_hair", "pastel_background", "playing_bass", "spade_tail", "wolf"], "stage3_selected_scores": {"claws": 0.6305, 
"wolf": 0.5983, "spade_tail": 0.872, "flowing_hair": 0.7019, "bass_(disambiguation)": 0.5206, "playing_bass": 0.5052, "pastel_background": 0.6263}, "stage3_selected_ranks": {"claws": 5, "wolf": 9, "spade_tail": 1, "flowing_hair": 2, "bass_(disambiguation)": 12, "playing_bass": 13, "pastel_background": 6}, "stage3_selected_phrase_ranks": {"claws": 1, "wolf": 1, "spade_tail": 1, "flowing_hair": 1, "bass_(disambiguation)": 1, "playing_bass": 1, "pastel_background": 1}, "extra_evidence": {"bass_(disambiguation)": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5206}, "canis": {"source": "implied"}, "flowing_hair": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7019}, "pastel_background": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6263}, "playing_bass": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5052}, "wolf": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5983}}, "structural": ["solo", "anthro", "clothed"], "probe": ["anthro", "canid", "solo"], "t1": 1.3, "t2": 1.38, "t3": 2.28, "t3s": 1.42, "t3p": 3.55, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=18 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=3"]}
10
+ {"id": 2021552, "n_gt": 25, "n_retrieved": 17, "n_selected": 29, "n_implied": 13, "n_structural": 4, "n_probe": 3, "ret_R": 0.44, "P": 0.6897, "R": 0.8, "F1": 0.7407, "leaf_P": 0.6667, "leaf_R": 0.6667, "leaf_F1": 0.6667, "n_leaf_sel": 15, "n_leaf_gt": 15, "ret_P": 0.6471, "sel_given_ret": 1.8182, "over_sel": 1.16, "why": {"explicit": 12}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 13, "attempts_by_n_local": {"19": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.6897, "gen_R": 0.8, "gen_F1": 0.7407, "missed": ["grey_background", "lagomorph", "leporid", "looking_at_another", "rabbit"], "extra": ["black_bottomwear", "black_clothing", "black_pants", "blue_overalls", "looking_at_viewer", "open_mouth", "white_clothing", "white_shirt", "white_topwear"], "ground_truth_tags": ["anthro", "bottomwear", "canid", "canine", "claws", "clothed", "clothing", "crossed_arms", "duo", "facial_markings", "fox", "fur", "grey_background", "head_markings", "lagomorph", "leporid", "looking_at_another", "mammal", "markings", "overalls", "pants", "rabbit", "shirt", "standing", "topwear"], "selected_tags": ["anthro", "black_bottomwear", "black_clothing", "black_pants", "blue_overalls", "bottomwear", "canid", "canine", "claws", "clothed", "clothing", "crossed_arms", "duo", "facial_markings", "fox", "fur", "head_markings", "looking_at_viewer", "mammal", "markings", "open_mouth", "overalls", "pants", "shirt", "standing", "topwear", "white_clothing", "white_shirt", "white_topwear"], "stage3_selected": ["black_pants", 
"blue_overalls", "claws", "crossed_arms", "facial_markings", "fox", "fur", "open_mouth", "overalls", "shirt", "standing", "white_shirt"], "stage3_selected_scores": {"fur": 0.647, "open_mouth": 0.6268, "claws": 0.5818, "standing": 0.681, "fox": 0.634, "shirt": 0.7434, "facial_markings": 0.6877, "crossed_arms": 0.7223, "white_shirt": 0.8155, "overalls": 0.8759, "black_pants": 0.8282, "blue_overalls": 0.9189}, "stage3_selected_ranks": {"fur": 12, "open_mouth": 14, "claws": 17, "standing": 9, "fox": 13, "shirt": 6, "facial_markings": 8, "crossed_arms": 7, "white_shirt": 4, "overalls": 2, "black_pants": 3, "blue_overalls": 1}, "stage3_selected_phrase_ranks": {"fur": 1, "open_mouth": 1, "claws": 1, "standing": 1, "fox": 1, "shirt": 1, "facial_markings": 1, "crossed_arms": 1, "white_shirt": 1, "overalls": 1, "black_pants": 1, "blue_overalls": 1}, "extra_evidence": {"black_bottomwear": {"source": "implied"}, "black_clothing": {"source": "implied"}, "black_pants": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8282}, "blue_overalls": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9189}, "looking_at_viewer": {"source": "structural"}, "open_mouth": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6268}, "white_clothing": {"source": "implied"}, "white_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8155}, "white_topwear": {"source": "implied"}}, "structural": ["duo", "anthro", "clothed", "looking_at_viewer"], "probe": ["simple_background", "anthro", "duo"], "t1": 1.05, "t2": 1.43, "t3": 2.54, "t3s": 0.62, "t3p": 2.58, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=19 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
11
+ {"id": 2034167, "n_gt": 11, "n_retrieved": 21, "n_selected": 21, "n_implied": 9, "n_structural": 4, "n_probe": 3, "ret_R": 0.5455, "P": 0.381, "R": 0.7273, "F1": 0.5, "leaf_P": 0.3636, "leaf_R": 0.5714, "leaf_F1": 0.4444, "n_leaf_sel": 11, "n_leaf_gt": 7, "ret_P": 0.2857, "sel_given_ret": 1.3333, "over_sel": 1.91, "why": {"explicit": 6, "strong_implied": 2}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 16, "attempts_by_n_local": {"24": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.381, "gen_R": 0.7273, "gen_F1": 0.5, "missed": ["open_mouth", "white_body", "white_fur"], "extra": ["action_pose", "ambiguous_gender", "animal_humanoid", "anthro", "canid_humanoid", "canine_humanoid", "clothed", "clothing", "humanoid", "jumping", "mammal_humanoid", "pose", "topless"], "ground_truth_tags": ["blue_eyes", "blue_nose", "canid", "canine", "fur", "mammal", "open_mouth", "purple_body", "solo", "white_body", "white_fur"], "selected_tags": ["action_pose", "ambiguous_gender", "animal_humanoid", "anthro", "blue_eyes", "blue_nose", "canid", "canid_humanoid", "canine", "canine_humanoid", "clothed", "clothing", "fur", "humanoid", "jumping", "mammal", "mammal_humanoid", "pose", "purple_body", "solo", "topless"], "stage3_selected": ["action_pose", "blue_eyes", "blue_nose", "canine_humanoid", "fur", "jumping", "purple_body", "simple_background"], "stage3_selected_scores": {"fur": 0.5679, "simple_background": 0.5795, "blue_eyes": 0.5832, "purple_body": 0.5484, "canine_humanoid": 0.9129, "blue_nose": 0.5927, 
"action_pose": 0.5954, "jumping": 0.5819}, "stage3_selected_ranks": {"fur": 16, "simple_background": 13, "blue_eyes": 11, "purple_body": 18, "canine_humanoid": 1, "blue_nose": 8, "action_pose": 7, "jumping": 12}, "stage3_selected_phrase_ranks": {"fur": 1, "simple_background": 1, "blue_eyes": 1, "purple_body": 1, "canine_humanoid": 1, "blue_nose": 1, "action_pose": 1, "jumping": 1}, "extra_evidence": {"action_pose": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.5954}, "ambiguous_gender": {"source": "structural"}, "animal_humanoid": {"source": "implied"}, "anthro": {"source": "structural"}, "canid_humanoid": {"source": "implied"}, "canine_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9129}, "clothed": {"source": "implied"}, "clothing": {"source": "implied"}, "humanoid": {"source": "implied"}, "jumping": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.5819}, "mammal_humanoid": {"source": "implied"}, "pose": {"source": "implied"}, "topless": {"source": "structural"}}, "structural": ["solo", "anthro", "ambiguous_gender", "topless"], "probe": ["anthro", "canid", "solo"], "t1": 1.41, "t2": 1.68, "t3": 5.47, "t3s": 1.42, "t3p": 4.61, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=24 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=4"]}
data/eval_results/latency_k1_seed43.jsonl ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"_meta": true, "timestamp": "2026-03-02T08:44:09.015467", "n_samples": 10, "caption_field": "caption_cogvlm", "skip_rewrite": false, "allow_nsfw": false, "mode": "chunked_map_union", "chunk_size": 60, "eval_path": "data/eval_samples/e621_sfw_sample_1000_seed123_buffer10000_caption_evident.jsonl", "per_phrase_k": 2, "per_phrase_final_k": 1, "temperature": 0.0, "shuffle": false, "seed": 43, "workers": 4, "min_why": "strong_implied", "expand_implications": true, "infer_structural": true, "infer_probe": true, "n_errors": 0, "n_issue_samples": 10, "n_issues_total": 20}
2
+ {"id": 3285630, "n_gt": 12, "n_retrieved": 15, "n_selected": 24, "n_implied": 6, "n_structural": 4, "n_probe": 5, "ret_R": 0.25, "P": 0.375, "R": 0.75, "F1": 0.5, "leaf_P": 0.3571, "leaf_R": 0.5556, "leaf_F1": 0.4348, "n_leaf_sel": 14, "n_leaf_gt": 9, "ret_P": 0.2, "sel_given_ret": 3.0, "over_sel": 2.0, "why": {"explicit": 11, "strong_implied": 1}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 15, "attempts_by_n_local": {"18": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.375, "gen_R": 0.75, "gen_F1": 0.5, "missed": ["alpha_channel", "fingers", "hair"], "extra": ["black_body", "black_fur", "business_attire", "formal", "holding_mug", "holding_object", "mug", "necktie", "shirt", "teal_shirt", "teal_topwear", "text", "topwear", "vest", "white_necktie"], "ground_truth_tags": ["alpha_channel", "anthro", "clothed", "clothing", "felid", "feline", "fingers", "fur", "hair", "male", "mammal", "solo"], "selected_tags": ["anthro", "black_body", "black_fur", "business_attire", "clothed", "clothing", "felid", "feline", "formal", "fur", "holding_mug", "holding_object", "male", "mammal", "mug", "necktie", "shirt", "solo", "teal_shirt", "teal_topwear", "text", "topwear", "vest", "white_necktie"], "stage3_selected": ["black_fur", "business_attire", "feline", "formal", "holding_mug", "mug", "necktie", "shirt", "simple_background", "teal_shirt", "vest", "white_necktie"], "stage3_selected_scores": {"simple_background": 0.6978, "shirt": 0.7998, "black_fur": 0.7183, "necktie": 0.7314, "vest": 0.8403, "mug": 0.8841, 
"holding_mug": 0.916, "formal": 0.5993, "business_attire": 0.5558, "teal_shirt": 0.7474, "white_necktie": 0.6418, "feline": 0.7062}, "stage3_selected_ranks": {"simple_background": 11, "shirt": 5, "black_fur": 8, "necktie": 7, "vest": 3, "mug": 2, "holding_mug": 1, "formal": 16, "business_attire": 18, "teal_shirt": 6, "white_necktie": 14, "feline": 10}, "stage3_selected_phrase_ranks": {"simple_background": 1, "shirt": 1, "black_fur": 1, "necktie": 1, "vest": 1, "mug": 1, "holding_mug": 1, "formal": 1, "business_attire": 1, "teal_shirt": 1, "white_necktie": 1, "feline": 1}, "extra_evidence": {"black_body": {"source": "implied"}, "black_fur": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7183}, "business_attire": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5558}, "formal": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5993}, "holding_mug": {"source": "stage3", "why": "explicit", "retrieval_score": 0.916}, "holding_object": {"source": "implied"}, "mug": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8841}, "necktie": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7314}, "shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7998}, "teal_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7474}, "teal_topwear": {"source": "implied"}, "text": {"source": "probe"}, "topwear": {"source": "implied"}, "vest": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8403}, "white_necktie": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6418}}, "structural": ["solo", "anthro", "male", "clothed"], "probe": ["clothing", "anthro", "text", "felid", "solo"], "t1": 2.86, "t2": 5.08, "t3": 0.66, "t3s": 4.7, "t3p": 7.96, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=18 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
3
+ {"id": 260449, "n_gt": 14, "n_retrieved": 18, "n_selected": 19, "n_implied": 5, "n_structural": 6, "n_probe": 4, "ret_R": 0.5, "P": 0.5263, "R": 0.7143, "F1": 0.6061, "leaf_P": 0.3333, "leaf_R": 0.4, "leaf_F1": 0.3636, "n_leaf_sel": 12, "n_leaf_gt": 10, "ret_P": 0.3889, "sel_given_ret": 1.4286, "over_sel": 1.36, "why": {"explicit": 9}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 13, "attempts_by_n_local": {"21": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.5263, "gen_R": 0.7143, "gen_F1": 0.6061, "missed": ["fur", "hair", "human", "male"], "extra": ["anthro", "cheeky", "grin", "humanoid", "laugh", "raised_arms", "smile", "topless", "trio"], "ground_truth_tags": ["ape", "bear", "clothed", "clothing", "dancing", "fur", "group", "hair", "haplorhine", "human", "looking_at_viewer", "male", "mammal", "primate"], "selected_tags": ["anthro", "ape", "bear", "cheeky", "clothed", "clothing", "dancing", "grin", "group", "haplorhine", "humanoid", "laugh", "looking_at_viewer", "mammal", "primate", "raised_arms", "smile", "topless", "trio"], "stage3_selected": ["ape", "bear", "cheeky", "dancing", "grin", "laugh", "looking_at_viewer", "raised_arms", "simple_background"], "stage3_selected_scores": {"simple_background": 0.5491, "looking_at_viewer": 0.5483, "bear": 0.5736, "grin": 0.5653, "dancing": 0.5576, "laugh": 0.526, "ape": 0.9767, "raised_arms": 0.5461, "cheeky": 0.3905}, "stage3_selected_ranks": {"simple_background": 8, "looking_at_viewer": 9, "bear": 3, "grin": 5, "dancing": 7, "laugh": 13, "ape": 1, 
"raised_arms": 10, "cheeky": 20}, "stage3_selected_phrase_ranks": {"simple_background": 1, "looking_at_viewer": 1, "bear": 1, "grin": 1, "dancing": 1, "laugh": 1, "ape": 1, "raised_arms": 1, "cheeky": 1}, "extra_evidence": {"anthro": {"source": "structural"}, "cheeky": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3905}, "grin": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5653}, "humanoid": {"source": "structural"}, "laugh": {"source": "stage3", "why": "explicit", "retrieval_score": 0.526}, "raised_arms": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5461}, "smile": {"source": "implied"}, "topless": {"source": "structural"}, "trio": {"source": "structural"}}, "structural": ["trio", "anthro", "humanoid", "clothed", "topless", "looking_at_viewer"], "probe": ["simple_background", "anthro", "group", "bear"], "t1": 5.6, "t2": 7.33, "t3": 5.32, "t3s": 4.65, "t3p": 5.08, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=21 entity=0 copyright_filtered=0 generic_char_to_general=1 unknown_type=2"]}
4
+ {"id": 1078019, "n_gt": 14, "n_retrieved": 14, "n_selected": 15, "n_implied": 3, "n_structural": 4, "n_probe": 4, "ret_R": 0.3571, "P": 0.6667, "R": 0.7143, "F1": 0.6897, "leaf_P": 0.6667, "leaf_R": 0.8889, "leaf_F1": 0.7619, "n_leaf_sel": 12, "n_leaf_gt": 9, "ret_P": 0.3571, "sel_given_ret": 2.0, "over_sel": 1.07, "why": {"explicit": 4, "strong_implied": 2}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 12, "attempts_by_n_local": {"15": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.6667, "gen_R": 0.7143, "gen_F1": 0.6897, "missed": ["lagomorph", "leporid", "mammal", "rabbit"], "extra": ["<3", "coat", "looking_at_viewer", "relationship", "topwear"], "ground_truth_tags": ["anthro", "blue_eyes", "blush", "clothed", "clothing", "duo", "lagomorph", "leporid", "mammal", "plushie", "rabbit", "romantic", "romantic_couple", "teal_eyes"], "selected_tags": ["<3", "anthro", "blue_eyes", "blush", "clothed", "clothing", "coat", "duo", "looking_at_viewer", "plushie", "relationship", "romantic", "romantic_couple", "teal_eyes", "topwear"], "stage3_selected": ["blue_eyes", "coat", "plushie", "relationship", "romantic_couple", "teal_eyes"], "stage3_selected_scores": {"blue_eyes": 0.6105, "coat": 0.6317, "plushie": 0.6568, "teal_eyes": 0.6345, "romantic_couple": 0.5619, "relationship": 0.6088}, "stage3_selected_ranks": {"blue_eyes": 7, "coat": 5, "plushie": 3, "teal_eyes": 4, "romantic_couple": 11, "relationship": 8}, "stage3_selected_phrase_ranks": {"blue_eyes": 1, "coat": 1, "plushie": 1, "teal_eyes": 1, 
"romantic_couple": 1, "relationship": 1}, "extra_evidence": {"<3": {"source": "probe"}, "coat": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6317}, "looking_at_viewer": {"source": "structural"}, "relationship": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.6088}, "topwear": {"source": "implied"}}, "structural": ["duo", "anthro", "clothed", "looking_at_viewer"], "probe": ["anthro", "blush", "duo", "<3"], "t1": 3.45, "t2": 6.07, "t3": 3.22, "t3s": 6.61, "t3p": 5.75, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=15 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
5
+ {"id": 1624724, "n_gt": 4, "n_retrieved": 13, "n_selected": 15, "n_implied": 0, "n_structural": 4, "n_probe": 4, "ret_R": 0.75, "P": 0.2, "R": 0.75, "F1": 0.3158, "leaf_P": 0.2, "leaf_R": 0.75, "leaf_F1": 0.3158, "n_leaf_sel": 15, "n_leaf_gt": 4, "ret_P": 0.2308, "sel_given_ret": 1.0, "over_sel": 3.75, "why": {"explicit": 9}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 10, "attempts_by_n_local": {"16": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.2, "gen_R": 0.75, "gen_F1": 0.3158, "missed": ["smile"], "extra": ["<3", "ambiguous_gender", "anthro", "cartoon", "clothing", "eyes", "feral", "floating", "nose", "nude", "round_eyes", "spots"], "ground_truth_tags": ["red_nose", "smile", "solo", "tan_body"], "selected_tags": ["<3", "ambiguous_gender", "anthro", "cartoon", "clothing", "eyes", "feral", "floating", "nose", "nude", "red_nose", "round_eyes", "solo", "spots", "tan_body"], "stage3_selected": ["cartoon", "eyes", "floating", "nose", "red_nose", "round_eyes", "spots", "tan_body", "white_background"], "stage3_selected_scores": {"white_background": 0.6267, "tan_body": 0.6777, "spots": 0.6331, "red_nose": 0.7461, "floating": 0.6778, "round_eyes": 0.8856, "cartoon": 0.514, "nose": 0.8851, "eyes": 0.929}, "stage3_selected_ranks": {"white_background": 11, "tan_body": 7, "spots": 10, "red_nose": 4, "floating": 6, "round_eyes": 2, "cartoon": 15, "nose": 3, "eyes": 1}, "stage3_selected_phrase_ranks": {"white_background": 1, "tan_body": 1, "spots": 1, "red_nose": 1, "floating": 1, "round_eyes": 1, 
"cartoon": 1, "nose": 1, "eyes": 1}, "extra_evidence": {"<3": {"source": "probe"}, "ambiguous_gender": {"source": "structural"}, "anthro": {"source": "probe"}, "cartoon": {"source": "stage3", "why": "explicit", "retrieval_score": 0.514}, "clothing": {"source": "probe"}, "eyes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.929}, "feral": {"source": "structural"}, "floating": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6778}, "nose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8851}, "nude": {"source": "structural"}, "round_eyes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8856}, "spots": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6331}}, "structural": ["solo", "feral", "ambiguous_gender", "nude"], "probe": ["anthro", "simple_background", "clothing", "<3"], "t1": 3.09, "t2": 6.6, "t3": 6.66, "t3s": 5.75, "t3p": 5.14, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=16 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=5"]}
6
+ {"id": 1325009, "n_gt": 22, "n_retrieved": 18, "n_selected": 20, "n_implied": 6, "n_structural": 5, "n_probe": 3, "ret_R": 0.2273, "P": 0.65, "R": 0.5909, "F1": 0.619, "leaf_P": 0.2857, "leaf_R": 0.3333, "leaf_F1": 0.3077, "n_leaf_sel": 14, "n_leaf_gt": 12, "ret_P": 0.2778, "sel_given_ret": 2.6, "over_sel": 0.91, "why": {"explicit": 5, "strong_implied": 3}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 11, "attempts_by_n_local": {"18": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.65, "gen_R": 0.5909, "gen_F1": 0.619, "missed": ["chest_tuft", "countershading", "muscular", "muscular_anthro", "muscular_male", "pantherine", "tiger", "topless", "tuft"], "extra": ["looking_at_viewer", "muscular_arms", "playful", "pose", "striped_body", "striped_fur", "white_chest"], "ground_truth_tags": ["anthro", "blue_eyes", "bottomwear", "chest_tuft", "clothed", "clothing", "countershading", "felid", "fur", "hand_on_head", "male", "mammal", "muscular", "muscular_anthro", "muscular_male", "pantherine", "shorts", "solo", "stripes", "tiger", "topless", "tuft"], "selected_tags": ["anthro", "blue_eyes", "bottomwear", "clothed", "clothing", "felid", "fur", "hand_on_head", "looking_at_viewer", "male", "mammal", "muscular_arms", "playful", "pose", "shorts", "solo", "striped_body", "striped_fur", "stripes", "white_chest"], "stage3_selected": ["blue_eyes", "hand_on_head", "muscular_arms", "playful", "pose", "shorts", "striped_fur", "white_chest"], "stage3_selected_scores": {"blue_eyes": 0.5717, "shorts": 0.5785, 
"striped_fur": 0.6385, "hand_on_head": 0.5932, "white_chest": 0.9198, "pose": 0.6235, "muscular_arms": 0.7948, "playful": 0.4236}, "stage3_selected_ranks": {"blue_eyes": 14, "shorts": 13, "striped_fur": 8, "hand_on_head": 11, "white_chest": 2, "pose": 9, "muscular_arms": 4, "playful": 19}, "stage3_selected_phrase_ranks": {"blue_eyes": 1, "shorts": 1, "striped_fur": 1, "hand_on_head": 1, "white_chest": 1, "pose": 1, "muscular_arms": 1, "playful": 1}, "extra_evidence": {"looking_at_viewer": {"source": "structural"}, "muscular_arms": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.7948}, "playful": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4236}, "pose": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.6235}, "striped_body": {"source": "implied"}, "striped_fur": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6385}, "white_chest": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9198}}, "structural": ["solo", "anthro", "male", "clothed", "looking_at_viewer"], "probe": ["anthro", "felid", "solo"], "t1": 1.21, "t2": 1.33, "t3": 0.77, "t3s": 1.08, "t3p": 1.28, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=18 entity=0 copyright_filtered=1 generic_char_to_general=0 unknown_type=2"]}
7
+ {"id": 1023509, "n_gt": 13, "n_retrieved": 16, "n_selected": 7, "n_implied": 0, "n_structural": 3, "n_probe": 4, "ret_R": 0.2308, "P": 0.1429, "R": 0.0769, "F1": 0.1, "leaf_P": 0.1429, "leaf_R": 0.1667, "leaf_F1": 0.1538, "n_leaf_sel": 7, "n_leaf_gt": 6, "ret_P": 0.1875, "sel_given_ret": 0.3333, "over_sel": 0.54, "why": {"explicit": 2}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 12, "attempts_by_n_local": {"16": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.1429, "gen_R": 0.0769, "gen_F1": 0.1, "missed": ["bovid", "caprine", "dialogue", "fur", "goat", "human", "lizard", "mammal", "reptile", "scalie", "white_body", "white_fur"], "extra": ["anthro", "clothing", "darkness", "group", "light", "solo"], "ground_truth_tags": ["bovid", "caprine", "dialogue", "fur", "goat", "human", "lizard", "mammal", "reptile", "scalie", "text", "white_body", "white_fur"], "selected_tags": ["anthro", "clothing", "darkness", "group", "light", "solo", "text"], "stage3_selected": ["darkness", "light"], "stage3_selected_scores": {"light": 0.7785, "darkness": 0.8348}, "stage3_selected_ranks": {"light": 4, "darkness": 2}, "stage3_selected_phrase_ranks": {"light": 1, "darkness": 1}, "extra_evidence": {"anthro": {"source": "probe"}, "clothing": {"source": "probe"}, "darkness": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8348}, "group": {"source": "structural"}, "light": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7785}, "solo": {"source": "structural"}}, "structural": ["solo", "group", 
"text"], "probe": ["clothing", "anthro", "text", "group"], "t1": 2.27, "t2": 1.61, "t3": 6.21, "t3s": 1.43, "t3p": 3.67, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=16 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0"]}
8
+ {"id": 335343, "n_gt": 15, "n_retrieved": 22, "n_selected": 14, "n_implied": 1, "n_structural": 3, "n_probe": 3, "ret_R": 0.5333, "P": 0.6429, "R": 0.6, "F1": 0.6207, "leaf_P": 0.5385, "leaf_R": 0.5833, "leaf_F1": 0.56, "n_leaf_sel": 13, "n_leaf_gt": 12, "ret_P": 0.3636, "sel_given_ret": 1.125, "over_sel": 0.93, "why": {"explicit": 10}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 19, "attempts_by_n_local": {"23": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.6429, "gen_R": 0.6, "gen_F1": 0.6207, "missed": ["angry", "bed", "eyes_closed", "eyeshadow", "furniture", "lying"], "extra": ["annoyed_expression", "anthro", "bedroom", "humanoid", "resting"], "ground_truth_tags": ["angry", "bed", "blonde_hair", "blue_eyes", "duo", "eyes_closed", "eyeshadow", "furniture", "green_eyes", "hair", "lying", "makeup", "purple_hair", "sleeping", "text"], "selected_tags": ["annoyed_expression", "anthro", "bedroom", "blonde_hair", "blue_eyes", "duo", "green_eyes", "hair", "humanoid", "makeup", "purple_hair", "resting", "sleeping", "text"], "stage3_selected": ["annoyed_expression", "bedroom", "blonde_hair", "blue_eyes", "green_eyes", "makeup", "purple_hair", "resting", "sleeping", "text"], "stage3_selected_scores": {"text": 0.6007, "blue_eyes": 0.6014, "green_eyes": 0.5989, "blonde_hair": 0.5986, "purple_hair": 0.5642, "makeup": 0.5965, "bedroom": 0.4901, "sleeping": 0.6027, "resting": 0.5144, "annoyed_expression": 0.7251}, "stage3_selected_ranks": {"text": 8, "blue_eyes": 7, "green_eyes": 9, "blonde_hair": 10, 
"purple_hair": 13, "makeup": 11, "bedroom": 19, "sleeping": 6, "resting": 17, "annoyed_expression": 2}, "stage3_selected_phrase_ranks": {"text": 1, "blue_eyes": 1, "green_eyes": 1, "blonde_hair": 1, "purple_hair": 1, "makeup": 1, "bedroom": 1, "sleeping": 1, "resting": 1, "annoyed_expression": 1}, "extra_evidence": {"annoyed_expression": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7251}, "anthro": {"source": "probe"}, "bedroom": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4901}, "humanoid": {"source": "structural"}, "resting": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5144}}, "structural": ["duo", "humanoid", "text"], "probe": ["simple_background", "anthro", "duo"], "t1": 2.06, "t2": 2.13, "t3": 6.29, "t3s": 1.01, "t3p": 1.25, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=23 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
9
+ {"id": 17482, "n_gt": 22, "n_retrieved": 16, "n_selected": 18, "n_implied": 7, "n_structural": 4, "n_probe": 3, "ret_R": 0.2273, "P": 0.7778, "R": 0.6364, "F1": 0.7, "leaf_P": 0.4545, "leaf_R": 0.3846, "leaf_F1": 0.4167, "n_leaf_sel": 11, "n_leaf_gt": 13, "ret_P": 0.3125, "sel_given_ret": 2.8, "over_sel": 0.82, "why": {"explicit": 5, "strong_implied": 1}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 10, "attempts_by_n_local": {"18": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.7778, "gen_R": 0.6364, "gen_F1": 0.7, "missed": ["bass_guitar", "canine", "fingers", "fur", "holding_musical_instrument", "holding_object", "music", "torn_clothing"], "extra": ["flowing_hair", "male", "pastel_background", "pose"], "ground_truth_tags": ["anthro", "bass_guitar", "canid", "canine", "claws", "clothed", "clothing", "fingers", "fur", "guitar", "hair", "holding_musical_instrument", "holding_object", "mammal", "music", "musical_instrument", "plucked_string_instrument", "solo", "spade_tail", "string_instrument", "tail", "torn_clothing"], "selected_tags": ["anthro", "canid", "claws", "clothed", "clothing", "flowing_hair", "guitar", "hair", "male", "mammal", "musical_instrument", "pastel_background", "plucked_string_instrument", "pose", "solo", "spade_tail", "string_instrument", "tail"], "stage3_selected": ["claws", "flowing_hair", "guitar", "pastel_background", "pose", "spade_tail"], "stage3_selected_scores": {"claws": 0.5684, "spade_tail": 0.618, "guitar": 0.9623, "flowing_hair": 0.5669, "pastel_background": 
0.5632, "pose": 0.5761}, "stage3_selected_ranks": {"claws": 8, "spade_tail": 3, "guitar": 1, "flowing_hair": 9, "pastel_background": 11, "pose": 6}, "stage3_selected_phrase_ranks": {"claws": 1, "spade_tail": 1, "guitar": 1, "flowing_hair": 1, "pastel_background": 1, "pose": 1}, "extra_evidence": {"flowing_hair": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5669}, "male": {"source": "structural"}, "pastel_background": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5632}, "pose": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.5761}}, "structural": ["solo", "anthro", "male", "clothed"], "probe": ["anthro", "canid", "solo"], "t1": 3.66, "t2": 1.37, "t3": 2.75, "t3s": 1.5, "t3p": 1.9, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=18 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
10
+ {"id": 2021552, "n_gt": 25, "n_retrieved": 18, "n_selected": 32, "n_implied": 14, "n_structural": 4, "n_probe": 3, "ret_R": 0.44, "P": 0.7188, "R": 0.92, "F1": 0.807, "leaf_P": 0.6875, "leaf_R": 0.7333, "leaf_F1": 0.7097, "n_leaf_sel": 16, "n_leaf_gt": 15, "ret_P": 0.6111, "sel_given_ret": 2.0909, "over_sel": 1.28, "why": {"explicit": 14}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 19, "attempts_by_n_local": {"20": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.7188, "gen_R": 0.92, "gen_F1": 0.807, "missed": ["looking_at_another", "standing"], "extra": ["black_bottomwear", "black_clothing", "black_pants", "blue_overalls", "looking_at_viewer", "open_mouth", "white_clothing", "white_shirt", "white_topwear"], "ground_truth_tags": ["anthro", "bottomwear", "canid", "canine", "claws", "clothed", "clothing", "crossed_arms", "duo", "facial_markings", "fox", "fur", "grey_background", "head_markings", "lagomorph", "leporid", "looking_at_another", "mammal", "markings", "overalls", "pants", "rabbit", "shirt", "standing", "topwear"], "selected_tags": ["anthro", "black_bottomwear", "black_clothing", "black_pants", "blue_overalls", "bottomwear", "canid", "canine", "claws", "clothed", "clothing", "crossed_arms", "duo", "facial_markings", "fox", "fur", "grey_background", "head_markings", "lagomorph", "leporid", "looking_at_viewer", "mammal", "markings", "open_mouth", "overalls", "pants", "rabbit", "shirt", "topwear", "white_clothing", "white_shirt", "white_topwear"], "stage3_selected": ["black_pants", 
"blue_overalls", "claws", "crossed_arms", "facial_markings", "fox", "fur", "grey_background", "open_mouth", "overalls", "pants", "rabbit", "shirt", "white_shirt"], "stage3_selected_scores": {"fur": 0.6531, "open_mouth": 0.633, "claws": 0.6303, "fox": 0.638, "shirt": 0.7483, "rabbit": 0.6511, "pants": 0.7589, "grey_background": 0.6784, "facial_markings": 0.6945, "crossed_arms": 0.7285, "white_shirt": 0.8197, "overalls": 0.8776, "black_pants": 0.833, "blue_overalls": 0.9203}, "stage3_selected_ranks": {"fur": 11, "open_mouth": 14, "claws": 15, "fox": 13, "shirt": 6, "rabbit": 12, "pants": 5, "grey_background": 10, "facial_markings": 8, "crossed_arms": 7, "white_shirt": 4, "overalls": 2, "black_pants": 3, "blue_overalls": 1}, "stage3_selected_phrase_ranks": {"fur": 1, "open_mouth": 1, "claws": 1, "fox": 1, "shirt": 1, "rabbit": 1, "pants": 1, "grey_background": 1, "facial_markings": 1, "crossed_arms": 1, "white_shirt": 1, "overalls": 1, "black_pants": 1, "blue_overalls": 1}, "extra_evidence": {"black_bottomwear": {"source": "implied"}, "black_clothing": {"source": "implied"}, "black_pants": {"source": "stage3", "why": "explicit", "retrieval_score": 0.833}, "blue_overalls": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9203}, "looking_at_viewer": {"source": "structural"}, "open_mouth": {"source": "stage3", "why": "explicit", "retrieval_score": 0.633}, "white_clothing": {"source": "implied"}, "white_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8197}, "white_topwear": {"source": "implied"}}, "structural": ["duo", "anthro", "clothed", "looking_at_viewer"], "probe": ["simple_background", "anthro", "duo"], "t1": 2.68, "t2": 1.57, "t3": 1.21, "t3s": 0.41, "t3p": 3.87, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=20 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
11
+ {"id": 2034167, "n_gt": 11, "n_retrieved": 23, "n_selected": 21, "n_implied": 8, "n_structural": 4, "n_probe": 3, "ret_R": 0.5455, "P": 0.381, "R": 0.7273, "F1": 0.5, "leaf_P": 0.3333, "leaf_R": 0.5714, "leaf_F1": 0.4211, "n_leaf_sel": 12, "n_leaf_gt": 7, "ret_P": 0.2609, "sel_given_ret": 1.3333, "over_sel": 1.91, "why": {"explicit": 6, "strong_implied": 3}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 13, "attempts_by_n_local": {"26": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.381, "gen_R": 0.7273, "gen_F1": 0.5, "missed": ["open_mouth", "white_body", "white_fur"], "extra": ["action_pose", "animal_humanoid", "anthro", "canid_humanoid", "canine_humanoid", "clothed", "clothing", "curved_tail", "humanoid", "male", "mammal_humanoid", "pose", "tail"], "ground_truth_tags": ["blue_eyes", "blue_nose", "canid", "canine", "fur", "mammal", "open_mouth", "purple_body", "solo", "white_body", "white_fur"], "selected_tags": ["action_pose", "animal_humanoid", "anthro", "blue_eyes", "blue_nose", "canid", "canid_humanoid", "canine", "canine_humanoid", "clothed", "clothing", "curved_tail", "fur", "humanoid", "male", "mammal", "mammal_humanoid", "pose", "purple_body", "solo", "tail"], "stage3_selected": ["action_pose", "blue_eyes", "blue_nose", "canine_humanoid", "curved_tail", "fur", "purple_body", "simple_background", "tail"], "stage3_selected_scores": {"fur": 0.5841, "simple_background": 0.5948, "blue_eyes": 0.5995, "purple_body": 0.564, "canine_humanoid": 0.9003, "blue_nose": 0.6032, "tail": 0.6107, 
"action_pose": 0.617, "curved_tail": 0.637}, "stage3_selected_ranks": {"fur": 18, "simple_background": 17, "blue_eyes": 15, "purple_body": 21, "canine_humanoid": 1, "blue_nose": 12, "tail": 10, "action_pose": 9, "curved_tail": 7}, "stage3_selected_phrase_ranks": {"fur": 1, "simple_background": 1, "blue_eyes": 1, "purple_body": 1, "canine_humanoid": 1, "blue_nose": 1, "tail": 1, "action_pose": 1, "curved_tail": 1}, "extra_evidence": {"action_pose": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.617}, "animal_humanoid": {"source": "implied"}, "anthro": {"source": "structural"}, "canid_humanoid": {"source": "implied"}, "canine_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9003}, "clothed": {"source": "structural"}, "clothing": {"source": "implied"}, "curved_tail": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.637}, "humanoid": {"source": "implied"}, "male": {"source": "structural"}, "mammal_humanoid": {"source": "implied"}, "pose": {"source": "implied"}, "tail": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.6107}}, "structural": ["solo", "anthro", "male", "clothed"], "probe": ["anthro", "canid", "solo"], "t1": 3.87, "t2": 1.89, "t3": 3.1, "t3s": 1.78, "t3p": 0.98, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=26 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=4"]}
data/eval_results/latency_k4_seed43.jsonl ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"_meta": true, "timestamp": "2026-03-02T08:45:18.372193", "n_samples": 10, "caption_field": "caption_cogvlm", "skip_rewrite": false, "allow_nsfw": false, "mode": "chunked_map_union", "chunk_size": 60, "eval_path": "data/eval_samples/e621_sfw_sample_1000_seed123_buffer10000_caption_evident.jsonl", "per_phrase_k": 2, "per_phrase_final_k": 4, "temperature": 0.0, "shuffle": false, "seed": 43, "workers": 4, "min_why": "strong_implied", "expand_implications": true, "infer_structural": true, "infer_probe": true, "n_errors": 0, "n_issue_samples": 10, "n_issues_total": 22}
2
+ {"id": 3285630, "n_gt": 12, "n_retrieved": 65, "n_selected": 39, "n_implied": 17, "n_structural": 4, "n_probe": 5, "ret_R": 0.25, "P": 0.2051, "R": 0.6667, "F1": 0.3137, "leaf_P": 0.1111, "leaf_R": 0.2222, "leaf_F1": 0.1481, "n_leaf_sel": 18, "n_leaf_gt": 9, "ret_P": 0.0462, "sel_given_ret": 2.6667, "over_sel": 3.25, "why": {"explicit": 15, "strong_implied": 2}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 3, "attempt_errors": 1, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 43, "attempts_by_n_local": {"60": {"attempts": 2, "parse_ok": 1, "parse_fail": 0, "errors": 1}, "9": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.3333333333333333, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.2051, "gen_R": 0.6667, "gen_F1": 0.3137, "missed": ["alpha_channel", "fingers", "fur", "male"], "extra": ["bottom_heavy", "business_attire", "container", "cup", "domestic_cat", "felis", "formal", "gesture", "grey_clothing", "grey_shirt", "grey_topwear", "hair_bun", "handshake", "holding_container", "holding_cup", "holding_mug", "holding_object", "mug", "necktie", "ranged_weapon", "raygun", "shirt", "suit_jacket", "teal_shirt", "teal_topwear", "text", "topless", "topwear", "vest", "weapon", "white_necktie"], "ground_truth_tags": ["alpha_channel", "anthro", "clothed", "clothing", "felid", "feline", "fingers", "fur", "hair", "male", "mammal", "solo"], "selected_tags": ["anthro", "bottom_heavy", "business_attire", "clothed", "clothing", "container", "cup", "domestic_cat", "felid", "feline", "felis", "formal", "gesture", "grey_clothing", "grey_shirt", "grey_topwear", "hair", "hair_bun", "handshake", "holding_container", "holding_cup", 
"holding_mug", "holding_object", "mammal", "mug", "necktie", "ranged_weapon", "raygun", "shirt", "solo", "suit_jacket", "teal_shirt", "teal_topwear", "text", "topless", "topwear", "vest", "weapon", "white_necktie"], "stage3_selected": ["bottom_heavy", "business_attire", "domestic_cat", "feline", "formal", "grey_shirt", "hair_bun", "handshake", "holding_cup", "holding_mug", "invalid_background", "raygun", "simple_background", "suit_jacket", "teal_shirt", "vest", "white_necktie"], "stage3_selected_scores": {"simple_background": 0.7012, "feline": 0.7092, "vest": 0.8437, "holding_cup": 0.7694, "hair_bun": 0.6946, "bottom_heavy": 0.468, "grey_shirt": 0.7606, "holding_mug": 0.9184, "suit_jacket": 0.5953, "handshake": 0.5545, "formal": 0.601, "business_attire": 0.5683, "teal_shirt": 0.7483, "white_necktie": 0.644, "invalid_background": 0.6512, "domestic_cat": 0.6355, "raygun": 0.4506}, "stage3_selected_ranks": {"simple_background": 28, "feline": 26, "vest": 3, "holding_cup": 10, "hair_bun": 29, "bottom_heavy": 68, "grey_shirt": 12, "holding_mug": 1, "suit_jacket": 48, "handshake": 59, "formal": 46, "business_attire": 54, "teal_shirt": 17, "white_necktie": 33, "invalid_background": 31, "domestic_cat": 37, "raygun": 69}, "stage3_selected_phrase_ranks": {"simple_background": 1, "feline": 1, "vest": 1, "holding_cup": 4, "hair_bun": 1, "bottom_heavy": 4, "grey_shirt": 1, "holding_mug": 1, "suit_jacket": 4, "handshake": 3, "formal": 1, "business_attire": 1, "teal_shirt": 4, "white_necktie": 4, "invalid_background": 1, "domestic_cat": 4, "raygun": 4}, "extra_evidence": {"bottom_heavy": {"source": "stage3", "why": "explicit", "retrieval_score": 0.468}, "business_attire": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5683}, "container": {"source": "implied"}, "cup": {"source": "implied"}, "domestic_cat": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.6355}, "felis": {"source": "implied"}, "formal": {"source": "stage3", "why": "explicit", 
"retrieval_score": 0.601}, "gesture": {"source": "implied"}, "grey_clothing": {"source": "implied"}, "grey_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7606}, "grey_topwear": {"source": "implied"}, "hair_bun": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6946}, "handshake": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5545}, "holding_container": {"source": "implied"}, "holding_cup": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7694}, "holding_mug": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9184}, "holding_object": {"source": "implied"}, "mug": {"source": "implied"}, "necktie": {"source": "implied"}, "ranged_weapon": {"source": "implied"}, "raygun": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4506}, "shirt": {"source": "implied"}, "suit_jacket": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5953}, "teal_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7483}, "teal_topwear": {"source": "implied"}, "text": {"source": "probe"}, "topless": {"source": "structural"}, "topwear": {"source": "implied"}, "vest": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8437}, "weapon": {"source": "implied"}, "white_necktie": {"source": "stage3", "why": "explicit", "retrieval_score": 0.644}}, "structural": ["solo", "anthro", "clothed", "topless"], "probe": ["clothing", "anthro", "text", "felid", "solo"], "t1": 3.76, "t2": 5.25, "t3": 16.36, "t3s": 3.62, "t3p": 5.07, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=69 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2", "Stage3 general_chunk_0: attempt 1 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 2, \"why\": \"explicit\"}, {\"i\": 3, \"why\": \"explicit\"}, {\"i\": 4, \"why\": \"explicit\"}, {\"i\": 5, \"why\": 
\"explicit\"}, {\"i\": 6, \"why\": \"explicit\"}, {\"i\": 7, \"why\": \"explicit\"}, {\"i\": 8, \"why\": \"explicit\"}, {\"i\": 10, \"why\": \"explicit\"}, {\"i\": 11, \"why\": \"other\"}, {\"i\": 13, \"why\": \"other\"}, {\"i\": 14, \"why\": \"explicit\"}, {\"i\": 15, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"other\"}, {\"i\": 17, \"why\": \"explicit\"}, {\"i\": 18, \"why\": \"explicit\"}, {\"i\": 19, \"why\": \"explicit\"}, {\"i\": 20, \"why\": \"explicit\"}, {\"i\": 23, \"why\": \"other\"}, {\"i\": 25, \"why\": \"explicit\"}, {\"i\": 26, \"why\": \"other\"}, {\"i\": 27, \"why\": \"other\"}, {\"i\": 30, \"why\": \"other\"}, {\"i\": 31, \"why\": \"explicit\"}, {\"i\": 33, \"why\": \"other\"}, {\"i\": 34, \"why\": \"other\"}, {\"i\": 35, \"why\": \"other\"}, {\"i\": 36, \"why\": \"explicit\"}, {\"i\": 37, \"why\": \"other\"}, {\"i\": 38, \"why\": \"explicit\"}, {\"i\": 40, \"why\": \"other\"}, {\"i\": 43, \"why\": \"other\"}, {\"i\": 45, \"why\": \"other\"}, {\"i\": 46, \"why\": \"other\"}, {\"i\": 47, \"why\": \"explicit\"}, {\"i\": 48, \"why\": \"other\"}, {\"i\": 49, \"why\": \"explicit\"}, {\"i\": 50, \"why\": \"other\"}, {\"i\": 51, \"why\": \"other\"}, {\"i\": 52}]}. Got: 1 validation error for Stage3SelectionResponse\nselections.38.why\n Field required [type=missing, input_value={'i': 52}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE "]}
3
+ {"id": 260449, "n_gt": 14, "n_retrieved": 66, "n_selected": 30, "n_implied": 10, "n_structural": 7, "n_probe": 6, "ret_R": 0.5714, "P": 0.3667, "R": 0.7857, "F1": 0.5, "leaf_P": 0.1875, "leaf_R": 0.3, "leaf_F1": 0.2308, "n_leaf_sel": 16, "n_leaf_gt": 10, "ret_P": 0.1212, "sel_given_ret": 1.375, "over_sel": 2.14, "why": {"explicit": 13}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 28, "attempts_by_n_local": {"60": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}, "9": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3667, "gen_R": 0.7857, "gen_F1": 0.5, "missed": ["fur", "hair", "human"], "extra": ["<3", "anthro", "chimpanzee", "duo", "feral", "gorilla", "grinning_at_viewer", "one_eye_closed", "pan_(genus)", "raised_arms", "smile", "smiling_at_viewer", "smirk", "smirking_at_viewer", "topless", "trio", "wide_grin", "wink", "winking_at_viewer"], "ground_truth_tags": ["ape", "bear", "clothed", "clothing", "dancing", "fur", "group", "hair", "haplorhine", "human", "looking_at_viewer", "male", "mammal", "primate"], "selected_tags": ["<3", "anthro", "ape", "bear", "chimpanzee", "clothed", "clothing", "dancing", "duo", "feral", "gorilla", "grinning_at_viewer", "group", "haplorhine", "looking_at_viewer", "male", "mammal", "one_eye_closed", "pan_(genus)", "primate", "raised_arms", "smile", "smiling_at_viewer", "smirk", "smirking_at_viewer", "topless", "trio", "wide_grin", "wink", "winking_at_viewer"], "stage3_selected": ["bear", "chimpanzee", "dancing", "gorilla", "grinning_at_viewer", 
"looking_at_viewer", "male", "primate", "raised_arms", "simple_background", "smirking_at_viewer", "wide_grin", "winking_at_viewer"], "stage3_selected_scores": {"male": 0.5604, "simple_background": 0.5491, "looking_at_viewer": 0.5475, "bear": 0.5735, "primate": 0.8905, "dancing": 0.5568, "raised_arms": 0.5445, "gorilla": 0.8299, "winking_at_viewer": 0.404, "chimpanzee": 0.8275, "smirking_at_viewer": 0.4352, "grinning_at_viewer": 0.442, "wide_grin": 0.5267}, "stage3_selected_ranks": {"male": 9, "simple_background": 11, "looking_at_viewer": 12, "bear": 6, "primate": 2, "dancing": 10, "raised_arms": 13, "gorilla": 4, "winking_at_viewer": 47, "chimpanzee": 5, "smirking_at_viewer": 34, "grinning_at_viewer": 32, "wide_grin": 15}, "stage3_selected_phrase_ranks": {"male": 1, "simple_background": 1, "looking_at_viewer": 1, "bear": 1, "primate": 1, "dancing": 1, "raised_arms": 1, "gorilla": 2, "winking_at_viewer": 4, "chimpanzee": 3, "smirking_at_viewer": 4, "grinning_at_viewer": 3, "wide_grin": 1}, "extra_evidence": {"<3": {"source": "probe"}, "anthro": {"source": "structural"}, "chimpanzee": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8275}, "duo": {"source": "probe"}, "feral": {"source": "structural"}, "gorilla": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8299}, "grinning_at_viewer": {"source": "stage3", "why": "explicit", "retrieval_score": 0.442}, "one_eye_closed": {"source": "implied"}, "pan_(genus)": {"source": "implied"}, "raised_arms": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5445}, "smile": {"source": "implied"}, "smiling_at_viewer": {"source": "implied"}, "smirk": {"source": "implied"}, "smirking_at_viewer": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4352}, "topless": {"source": "structural"}, "trio": {"source": "structural"}, "wide_grin": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5267}, "wink": {"source": "implied"}, "winking_at_viewer": {"source": "stage3", "why": 
"explicit", "retrieval_score": 0.404}}, "structural": ["trio", "anthro", "feral", "male", "clothed", "topless", "looking_at_viewer"], "probe": ["anthro", "duo", "group", "bear", "simple_background", "<3"], "t1": 2.82, "t2": 5.79, "t3": 12.87, "t3s": 3.36, "t3p": 4.98, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=69 entity=1 copyright_filtered=1 generic_char_to_general=1 unknown_type=2"]}
4
+ {"id": 1078019, "n_gt": 14, "n_retrieved": 62, "n_selected": 29, "n_implied": 9, "n_structural": 3, "n_probe": 4, "ret_R": 0.6429, "P": 0.4828, "R": 1.0, "F1": 0.6512, "leaf_P": 0.4444, "leaf_R": 0.8889, "leaf_F1": 0.5926, "n_leaf_sel": 18, "n_leaf_gt": 9, "ret_P": 0.1452, "sel_given_ret": 1.5556, "over_sel": 2.07, "why": {"explicit": 14, "strong_implied": 1}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 3, "dupe_indices_total": 0, "kept_total": 26, "attempts_by_n_local": {"60": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}, "3": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.4828, "gen_R": 1.0, "gen_F1": 0.6512, "missed": [], "extra": ["<3", "coat", "cuddling", "holding_object", "holding_plushie", "padding", "raincoat", "red_clothing", "red_coat", "red_topwear", "relationship", "rosy_cheeks", "teal_body", "topwear", "wide_eyed"], "ground_truth_tags": ["anthro", "blue_eyes", "blush", "clothed", "clothing", "duo", "lagomorph", "leporid", "mammal", "plushie", "rabbit", "romantic", "romantic_couple", "teal_eyes"], "selected_tags": ["<3", "anthro", "blue_eyes", "blush", "clothed", "clothing", "coat", "cuddling", "duo", "holding_object", "holding_plushie", "lagomorph", "leporid", "mammal", "padding", "plushie", "rabbit", "raincoat", "red_clothing", "red_coat", "red_topwear", "relationship", "romantic", "romantic_couple", "rosy_cheeks", "teal_body", "teal_eyes", "topwear", "wide_eyed"], "stage3_selected": ["blue_eyes", "coat", "cuddling", "holding_plushie", "padding", "rabbit", "raincoat", "red_coat", "relationship", "romantic", 
"romantic_couple", "rosy_cheeks", "teal_body", "teal_eyes", "wide_eyed"], "stage3_selected_scores": {"blue_eyes": 0.615, "rabbit": 0.5939, "romantic": 0.5602, "romantic_couple": 0.562, "coat": 0.6383, "wide_eyed": 0.4616, "cuddling": 0.4804, "teal_eyes": 0.6283, "rosy_cheeks": 0.472, "teal_body": 0.4519, "holding_plushie": 0.7793, "raincoat": 0.5262, "red_coat": 0.5207, "relationship": 0.6206, "padding": 0.4927}, "stage3_selected_ranks": {"blue_eyes": 12, "rabbit": 13, "romantic": 18, "romantic_couple": 17, "coat": 7, "wide_eyed": 49, "cuddling": 42, "teal_eyes": 8, "rosy_cheeks": 45, "teal_body": 54, "holding_plushie": 2, "raincoat": 29, "red_coat": 32, "relationship": 9, "padding": 38}, "stage3_selected_phrase_ranks": {"blue_eyes": 1, "rabbit": 1, "romantic": 2, "romantic_couple": 1, "coat": 1, "wide_eyed": 4, "cuddling": 4, "teal_eyes": 1, "rosy_cheeks": 2, "teal_body": 4, "holding_plushie": 1, "raincoat": 2, "red_coat": 4, "relationship": 1, "padding": 4}, "extra_evidence": {"<3": {"source": "probe"}, "coat": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6383}, "cuddling": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4804}, "holding_object": {"source": "implied"}, "holding_plushie": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7793}, "padding": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4927}, "raincoat": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5262}, "red_clothing": {"source": "implied"}, "red_coat": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5207}, "red_topwear": {"source": "implied"}, "relationship": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6206}, "rosy_cheeks": {"source": "stage3", "why": "explicit", "retrieval_score": 0.472}, "teal_body": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4519}, "topwear": {"source": "implied"}, "wide_eyed": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4616}}, 
"structural": ["duo", "anthro", "clothed"], "probe": ["anthro", "blush", "duo", "<3"], "t1": 2.97, "t2": 6.21, "t3": 10.05, "t3s": 3.13, "t3p": 4.08, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=63 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
5
+ {"id": 1624724, "n_gt": 4, "n_retrieved": 50, "n_selected": 11, "n_implied": 0, "n_structural": 5, "n_probe": 3, "ret_R": 0.75, "P": 0.2727, "R": 0.75, "F1": 0.4, "leaf_P": 0.2727, "leaf_R": 0.75, "leaf_F1": 0.4, "n_leaf_sel": 11, "n_leaf_gt": 4, "ret_P": 0.06, "sel_given_ret": 1.0, "over_sel": 2.75, "why": {"explicit": 6}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 10, "attempts_by_n_local": {"52": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.2727, "gen_R": 0.75, "gen_F1": 0.4, "missed": ["smile"], "extra": ["ambiguous_gender", "anthro", "big_eyes", "feral", "floating", "looking_at_viewer", "nude", "spotted_face"], "ground_truth_tags": ["red_nose", "smile", "solo", "tan_body"], "selected_tags": ["ambiguous_gender", "anthro", "big_eyes", "feral", "floating", "looking_at_viewer", "nude", "red_nose", "solo", "spotted_face", "tan_body"], "stage3_selected": ["big_eyes", "floating", "red_nose", "spotted_face", "tan_body", "white_background"], "stage3_selected_scores": {"white_background": 0.6138, "tan_body": 0.6628, "big_eyes": 0.6961, "red_nose": 0.7501, "floating": 0.6519, "spotted_face": 0.6967}, "stage3_selected_ranks": {"white_background": 21, "tan_body": 11, "big_eyes": 6, "red_nose": 3, "floating": 13, "spotted_face": 5}, "stage3_selected_phrase_ranks": {"white_background": 1, "tan_body": 4, "big_eyes": 1, "red_nose": 1, "floating": 1, "spotted_face": 2}, "extra_evidence": {"ambiguous_gender": {"source": "structural"}, "anthro": {"source": "probe"}, "big_eyes": {"source": "stage3", "why": 
"explicit", "retrieval_score": 0.6961}, "feral": {"source": "structural"}, "floating": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6519}, "looking_at_viewer": {"source": "structural"}, "nude": {"source": "structural"}, "spotted_face": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6967}}, "structural": ["solo", "feral", "ambiguous_gender", "nude", "looking_at_viewer"], "probe": ["anthro", "simple_background", "solo"], "t1": 2.62, "t2": 5.68, "t3": 2.6, "t3s": 3.81, "t3p": 4.34, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=52 entity=0 copyright_filtered=1 generic_char_to_general=0 unknown_type=4"]}
6
+ {"id": 1325009, "n_gt": 22, "n_retrieved": 80, "n_selected": 26, "n_implied": 8, "n_structural": 4, "n_probe": 3, "ret_R": 0.3182, "P": 0.5, "R": 0.5909, "F1": 0.5417, "leaf_P": 0.125, "leaf_R": 0.1667, "leaf_F1": 0.1429, "n_leaf_sel": 16, "n_leaf_gt": 12, "ret_P": 0.0875, "sel_given_ret": 1.8571, "over_sel": 1.18, "why": {"explicit": 6, "strong_implied": 7}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 38, "attempts_by_n_local": {"60": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}, "20": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.5, "gen_R": 0.5909, "gen_F1": 0.5417, "missed": ["chest_tuft", "countershading", "hand_on_head", "muscular", "muscular_anthro", "muscular_male", "pantherine", "tiger", "topless"], "extra": ["blue_bottomwear", "blue_clothing", "blue_shorts", "firelight", "gesture", "hand_on_own_head", "light", "lighting", "muscular_legs", "raised_hand", "striped_body", "striped_fur", "white_chest"], "ground_truth_tags": ["anthro", "blue_eyes", "bottomwear", "chest_tuft", "clothed", "clothing", "countershading", "felid", "fur", "hand_on_head", "male", "mammal", "muscular", "muscular_anthro", "muscular_male", "pantherine", "shorts", "solo", "stripes", "tiger", "topless", "tuft"], "selected_tags": ["anthro", "blue_bottomwear", "blue_clothing", "blue_eyes", "blue_shorts", "bottomwear", "clothed", "clothing", "felid", "firelight", "fur", "gesture", "hand_on_own_head", "light", "lighting", "male", "mammal", "muscular_legs", "raised_hand", "shorts", "solo", "striped_body", 
"striped_fur", "stripes", "tuft", "white_chest"], "stage3_selected": ["blue_eyes", "blue_shorts", "firelight", "gesture", "hand_on_own_head", "lighting", "muscular_legs", "raised_hand", "shorts", "striped_body", "striped_fur", "tuft", "white_chest"], "stage3_selected_scores": {"blue_eyes": 0.5973, "tuft": 0.5246, "gesture": 0.6156, "striped_body": 0.4667, "raised_hand": 0.7153, "hand_on_own_head": 0.5995, "shorts": 0.6091, "striped_fur": 0.6688, "lighting": 0.7417, "muscular_legs": 0.7909, "blue_shorts": 0.6425, "white_chest": 0.9284, "firelight": 0.6667}, "stage3_selected_ranks": {"blue_eyes": 45, "tuft": 59, "gesture": 37, "striped_body": 77, "raised_hand": 21, "hand_on_own_head": 44, "shorts": 42, "striped_fur": 28, "lighting": 16, "muscular_legs": 10, "blue_shorts": 33, "white_chest": 2, "firelight": 29}, "stage3_selected_phrase_ranks": {"blue_eyes": 2, "tuft": 4, "gesture": 1, "striped_body": 1, "raised_hand": 1, "hand_on_own_head": 4, "shorts": 1, "striped_fur": 2, "lighting": 4, "muscular_legs": 2, "blue_shorts": 3, "white_chest": 1, "firelight": 4}, "extra_evidence": {"blue_bottomwear": {"source": "implied"}, "blue_clothing": {"source": "implied"}, "blue_shorts": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.6425}, "firelight": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.6667}, "gesture": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6156}, "hand_on_own_head": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5995}, "light": {"source": "implied"}, "lighting": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.7417}, "muscular_legs": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.7909}, "raised_hand": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7153}, "striped_body": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4667}, "striped_fur": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.6688}, "white_chest": 
{"source": "stage3", "why": "strong_implied", "retrieval_score": 0.9284}}, "structural": ["solo", "anthro", "male", "clothed"], "probe": ["anthro", "felid", "solo"], "t1": 1.09, "t2": 1.69, "t3": 6.94, "t3s": 0.9, "t3p": 2.79, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=80 entity=1 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
7
+ {"id": 1023509, "n_gt": 13, "n_retrieved": 71, "n_selected": 36, "n_implied": 7, "n_structural": 5, "n_probe": 6, "ret_R": 0.3846, "P": 0.25, "R": 0.6923, "F1": 0.3673, "leaf_P": 0.125, "leaf_R": 0.5, "leaf_F1": 0.2, "n_leaf_sel": 24, "n_leaf_gt": 6, "ret_P": 0.0704, "sel_given_ret": 1.8, "over_sel": 2.77, "why": {"explicit": 21, "strong_implied": 1}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 3, "attempt_errors": 1, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 47, "attempts_by_n_local": {"60": {"attempts": 2, "parse_ok": 1, "parse_fail": 0, "errors": 1}, "9": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.3333333333333333, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.25, "gen_R": 0.6923, "gen_F1": 0.3673, "missed": ["dialogue", "fur", "white_body", "white_fur"], "extra": ["action_figure", "agamid", "anthro", "clothed", "clothing", "dark", "darkness", "emote", "frilled_lizard", "gecko", "group", "guardian", "light", "lying_on_ground", "mask", "medical_instrument", "note", "note_pad", "on_ground", "pointy_speech_bubble", "scientific_instrument", "solo", "speech_bubble", "standing_over", "surgical_mask", "taur", "yuman"], "ground_truth_tags": ["bovid", "caprine", "dialogue", "fur", "goat", "human", "lizard", "mammal", "reptile", "scalie", "text", "white_body", "white_fur"], "selected_tags": ["action_figure", "agamid", "anthro", "bovid", "caprine", "clothed", "clothing", "dark", "darkness", "emote", "frilled_lizard", "gecko", "goat", "group", "guardian", "human", "light", "lizard", "lying_on_ground", "mammal", "mask", "medical_instrument", "note", "note_pad", "on_ground", "pointy_speech_bubble", "reptile", 
"scalie", "scientific_instrument", "solo", "speech_bubble", "standing_over", "surgical_mask", "taur", "text", "yuman"], "stage3_selected": ["action_figure", "bovid", "caprine", "dark", "darkness", "emote", "frilled_lizard", "gecko", "goat", "guardian", "human", "light", "lizard", "lying_on_ground", "note", "note_pad", "on_ground", "pointy_speech_bubble", "speech_bubble", "standing_over", "surgical_mask", "yuman"], "stage3_selected_scores": {"human": 0.5572, "speech_bubble": 0.5746, "bovid": 0.4536, "caprine": 0.4677, "lizard": 0.5943, "goat": 0.5777, "light": 0.5824, "on_ground": 0.4822, "dark": 0.4091, "gecko": 0.4436, "pointy_speech_bubble": 0.4666, "lying_on_ground": 0.5929, "darkness": 0.5977, "note": 0.5658, "emote": 0.3803, "yuman": 0.3939, "frilled_lizard": 0.4581, "standing_over": 0.5799, "surgical_mask": 0.369, "note_pad": 0.4164, "guardian": 0.3707, "action_figure": 0.4064}, "stage3_selected_ranks": {"human": 12, "speech_bubble": 9, "bovid": 28, "caprine": 23, "lizard": 3, "goat": 8, "light": 6, "on_ground": 18, "dark": 42, "gecko": 31, "pointy_speech_bubble": 25, "lying_on_ground": 4, "darkness": 2, "note": 11, "emote": 53, "yuman": 51, "frilled_lizard": 26, "standing_over": 7, "surgical_mask": 63, "note_pad": 38, "guardian": 62, "action_figure": 44}, "stage3_selected_phrase_ranks": {"human": 1, "speech_bubble": 1, "bovid": 4, "caprine": 3, "lizard": 1, "goat": 1, "light": 1, "on_ground": 3, "dark": 3, "gecko": 4, "pointy_speech_bubble": 4, "lying_on_ground": 1, "darkness": 1, "note": 1, "emote": 4, "yuman": 4, "frilled_lizard": 2, "standing_over": 1, "surgical_mask": 4, "note_pad": 2, "guardian": 3, "action_figure": 4}, "extra_evidence": {"action_figure": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4064}, "agamid": {"source": "implied"}, "anthro": {"source": "structural"}, "clothed": {"source": "structural"}, "clothing": {"source": "probe"}, "dark": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4091}, "darkness": 
{"source": "stage3", "why": "explicit", "retrieval_score": 0.5977}, "emote": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3803}, "frilled_lizard": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4581}, "gecko": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4436}, "group": {"source": "structural"}, "guardian": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3707}, "light": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5824}, "lying_on_ground": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5929}, "mask": {"source": "implied"}, "medical_instrument": {"source": "implied"}, "note": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5658}, "note_pad": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4164}, "on_ground": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4822}, "pointy_speech_bubble": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4666}, "scientific_instrument": {"source": "implied"}, "solo": {"source": "probe"}, "speech_bubble": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5746}, "standing_over": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5799}, "surgical_mask": {"source": "stage3", "why": "explicit", "retrieval_score": 0.369}, "taur": {"source": "structural"}, "yuman": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3939}}, "structural": ["group", "anthro", "taur", "clothed", "text"], "probe": ["clothing", "simple_background", "anthro", "text", "solo", "group"], "t1": 2.16, "t2": 1.47, "t3": 30.46, "t3s": 0.7, "t3p": 2.35, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=69 entity=0 copyright_filtered=2 generic_char_to_general=0 unknown_type=1", "Stage3 general_chunk_0: attempt 1 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": 
\"other\"}, {\"i\": 2, \"why\": \"other\"}, {\"i\": 3, \"why\": \"other\"}, {\"i\": 4, \"why\": \"explicit\"}, {\"i\": 5, \"why\": \"explicit\"}, {\"i\": 6, \"why\": \"weak_implied\"}, {\"i\": 7, \"why\": \"strong_implied\"}, {\"i\": 8, \"why\": \"explicit\"}, {\"i\": 9, \"why\": \"other\"}, {\"i\": 10, \"why\": \"other\"}, {\"i\": 11, \"why\": \"strong_implied\"}, {\"i\": 12, \"why\": \"explicit\"}, {\"i\": 13, \"why\": \"strong_implied\"}, {\"i\": 14, \"why\": \"explicit\"}, {\"i\": 15, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"explicit\"}, {\"i\": 17, \"why\": \"explicit\"}, {\"i\": 18, \"why\": \"other\"}, {\"i\": 19, \"why\": \"other\"}, {\"i\": 20, \"why\": \"other\"}, {\"i\": 21, \"why\": \"other\"}, {\"i\": 22, \"why\": \"weak_implied\"}, {\"i\": 23, \"why\": \"other\"}, {\"i\": 24, \"why\": \"other\"}, {\"i\": 25, \"why\": \"other\"}, {\"i\": 26, \"why\": \"other\"}, {\"i\": 27, \"why\": \"other\"}, {\"i\": 28, \"why\": \"other\"}, {\"i\": 29, \"why\": \"explicit\"}, {\"i\": 30, \"why\": \"weak_implied\"}, {\"i\": 31, \"why\": \"strong_implied\"}, {\"i\": 32, \"why\": \"other\"}, {\"i\": 33, \"why\": \"other\"}, {\"i\": 34, \"why\": \"explicit\"}, {\"i\": 35, \"why\": \"explicit\"}, {\"i\": 36, \"why\": \"other\"}, {\"i\": 37, \"why\": \"explicit\"}, {\"i\": 38}]}. Got: 1 validation error for Stage3SelectionResponse\nselections.37.why\n Field required [type=missing, input_value={'i': 38}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE "]}
8
+ {"id": 335343, "n_gt": 15, "n_retrieved": 94, "n_selected": 23, "n_implied": 5, "n_structural": 4, "n_probe": 2, "ret_R": 0.7333, "P": 0.5217, "R": 0.8, "F1": 0.6316, "leaf_P": 0.4375, "leaf_R": 0.5833, "leaf_F1": 0.5, "n_leaf_sel": 16, "n_leaf_gt": 12, "ret_P": 0.117, "sel_given_ret": 1.0909, "over_sel": 1.53, "why": {"explicit": 16}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 31, "attempts_by_n_local": {"60": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}, "36": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.5217, "gen_R": 0.8, "gen_F1": 0.6316, "missed": ["angry", "eyes_closed", "sleeping"], "extra": ["annoyed_expression", "anthro", "atmosphere", "calm", "clothed", "clothing", "english_text", "lying_on_bed", "on_bed", "pajamas", "sphere"], "ground_truth_tags": ["angry", "bed", "blonde_hair", "blue_eyes", "duo", "eyes_closed", "eyeshadow", "furniture", "green_eyes", "hair", "lying", "makeup", "purple_hair", "sleeping", "text"], "selected_tags": ["annoyed_expression", "anthro", "atmosphere", "bed", "blonde_hair", "blue_eyes", "calm", "clothed", "clothing", "duo", "english_text", "eyeshadow", "furniture", "green_eyes", "hair", "lying", "lying_on_bed", "makeup", "on_bed", "pajamas", "purple_hair", "sphere", "text"], "stage3_selected": ["annoyed_expression", "atmosphere", "blonde_hair", "blue_eyes", "calm", "duo", "english_text", "eyeshadow", "green_eyes", "lying", "lying_on_bed", "makeup", "pajamas", "purple_hair", "sphere", "text"], "stage3_selected_scores": {"duo": 0.4298, 
"text": 0.594, "blue_eyes": 0.595, "lying": 0.4445, "green_eyes": 0.5934, "blonde_hair": 0.5873, "purple_hair": 0.5592, "makeup": 0.5894, "eyeshadow": 0.4713, "lying_on_bed": 0.4059, "pajamas": 0.371, "annoyed_expression": 0.7219, "calm": 0.3466, "sphere": 0.4546, "atmosphere": 0.5039, "english_text": 0.4128}, "stage3_selected_ranks": {"duo": 54, "text": 7, "blue_eyes": 6, "lying": 46, "green_eyes": 9, "blonde_hair": 11, "purple_hair": 14, "makeup": 10, "eyeshadow": 39, "lying_on_bed": 73, "pajamas": 84, "annoyed_expression": 2, "calm": 90, "sphere": 43, "atmosphere": 26, "english_text": 67}, "stage3_selected_phrase_ranks": {"duo": 2, "text": 1, "blue_eyes": 1, "lying": 1, "green_eyes": 1, "blonde_hair": 1, "purple_hair": 1, "makeup": 1, "eyeshadow": 3, "lying_on_bed": 4, "pajamas": 4, "annoyed_expression": 1, "calm": 4, "sphere": 2, "atmosphere": 1, "english_text": 4}, "extra_evidence": {"annoyed_expression": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7219}, "anthro": {"source": "structural"}, "atmosphere": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5039}, "calm": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3466}, "clothed": {"source": "structural"}, "clothing": {"source": "implied"}, "english_text": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4128}, "lying_on_bed": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4059}, "on_bed": {"source": "implied"}, "pajamas": {"source": "stage3", "why": "explicit", "retrieval_score": 0.371}, "sphere": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4546}}, "structural": ["duo", "anthro", "clothed", "text"], "probe": ["anthro", "duo"], "t1": 2.14, "t2": 2.07, "t3": 11.58, "t3s": 1.41, "t3p": 1.3, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=96 entity=0 copyright_filtered=1 generic_char_to_general=0 unknown_type=4"]}
9
+ {"id": 17482, "n_gt": 22, "n_retrieved": 45, "n_selected": 28, "n_implied": 13, "n_structural": 4, "n_probe": 3, "ret_R": 0.1818, "P": 0.5, "R": 0.6364, "F1": 0.56, "leaf_P": 0.4167, "leaf_R": 0.3846, "leaf_F1": 0.4, "n_leaf_sel": 12, "n_leaf_gt": 13, "ret_P": 0.0889, "sel_given_ret": 3.5, "over_sel": 1.27, "why": {"explicit": 10}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 24, "attempts_by_n_local": {"48": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.5, "gen_R": 0.6364, "gen_F1": 0.56, "missed": ["canine", "fingers", "fur", "holding_musical_instrument", "holding_object", "music", "spade_tail", "tail"], "extra": ["bottomwear", "denim", "denim_clothing", "electric_guitar", "flowing_hair", "jeans", "looking_at_viewer", "pants", "pastel_background", "playing_guitar", "playing_music", "torn_bottomwear", "torn_jeans", "torn_pants"], "ground_truth_tags": ["anthro", "bass_guitar", "canid", "canine", "claws", "clothed", "clothing", "fingers", "fur", "guitar", "hair", "holding_musical_instrument", "holding_object", "mammal", "music", "musical_instrument", "plucked_string_instrument", "solo", "spade_tail", "string_instrument", "tail", "torn_clothing"], "selected_tags": ["anthro", "bass_guitar", "bottomwear", "canid", "claws", "clothed", "clothing", "denim", "denim_clothing", "electric_guitar", "flowing_hair", "guitar", "hair", "jeans", "looking_at_viewer", "mammal", "musical_instrument", "pants", "pastel_background", "playing_guitar", "playing_music", "plucked_string_instrument", "solo", 
"string_instrument", "torn_bottomwear", "torn_clothing", "torn_jeans", "torn_pants"], "stage3_selected": ["bass_guitar", "claws", "electric_guitar", "flowing_hair", "guitar", "pastel_background", "playing_guitar", "torn_bottomwear", "torn_jeans", "torn_pants"], "stage3_selected_scores": {"claws": 0.5504, "torn_bottomwear": 0.4254, "guitar": 0.9788, "torn_pants": 0.4559, "playing_guitar": 0.9494, "torn_jeans": 0.4784, "electric_guitar": 0.8829, "bass_guitar": 0.9286, "flowing_hair": 0.5466, "pastel_background": 0.5453}, "stage3_selected_ranks": {"claws": 11, "torn_bottomwear": 37, "guitar": 1, "torn_pants": 30, "playing_guitar": 2, "torn_jeans": 24, "electric_guitar": 5, "bass_guitar": 3, "flowing_hair": 13, "pastel_background": 14}, "stage3_selected_phrase_ranks": {"claws": 1, "torn_bottomwear": 3, "guitar": 1, "torn_pants": 2, "playing_guitar": 1, "torn_jeans": 1, "electric_guitar": 4, "bass_guitar": 2, "flowing_hair": 1, "pastel_background": 1}, "extra_evidence": {"bottomwear": {"source": "implied"}, "denim": {"source": "implied"}, "denim_clothing": {"source": "implied"}, "electric_guitar": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8829}, "flowing_hair": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5466}, "jeans": {"source": "implied"}, "looking_at_viewer": {"source": "structural"}, "pants": {"source": "implied"}, "pastel_background": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5453}, "playing_guitar": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9494}, "playing_music": {"source": "implied"}, "torn_bottomwear": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4254}, "torn_jeans": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4784}, "torn_pants": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4559}}, "structural": ["solo", "anthro", "clothed", "looking_at_viewer"], "probe": ["anthro", "canid", "solo"], "t1": 2.07, "t2": 0.98, "t3": 4.74, "t3s": 1.05, "t3p": 
2.05, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=48 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
10
+ {"id": 2021552, "n_gt": 25, "n_retrieved": 71, "n_selected": 12, "n_implied": 6, "n_structural": 4, "n_probe": 3, "ret_R": 0.56, "P": 0.5, "R": 0.24, "F1": 0.3243, "leaf_P": 0.5, "leaf_R": 0.2, "leaf_F1": 0.2857, "n_leaf_sel": 6, "n_leaf_gt": 15, "ret_P": 0.1972, "sel_given_ret": 0.4286, "over_sel": 0.48, "why": {"explicit": 2}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 1, "dupe_indices_total": 0, "kept_total": 43, "attempts_by_n_local": {"60": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}, "11": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.5, "gen_R": 0.24, "gen_F1": 0.3243, "missed": ["bottomwear", "canid", "canine", "claws", "crossed_arms", "facial_markings", "fox", "fur", "grey_background", "head_markings", "lagomorph", "leporid", "looking_at_another", "mammal", "markings", "overalls", "pants", "rabbit", "standing"], "extra": ["blue_clothing", "blue_topwear", "grey_clothing", "grey_shirt", "grey_topwear", "looking_at_viewer"], "ground_truth_tags": ["anthro", "bottomwear", "canid", "canine", "claws", "clothed", "clothing", "crossed_arms", "duo", "facial_markings", "fox", "fur", "grey_background", "head_markings", "lagomorph", "leporid", "looking_at_another", "mammal", "markings", "overalls", "pants", "rabbit", "shirt", "standing", "topwear"], "selected_tags": ["anthro", "blue_clothing", "blue_topwear", "clothed", "clothing", "duo", "grey_clothing", "grey_shirt", "grey_topwear", "looking_at_viewer", "shirt", "topwear"], "stage3_selected": ["blue_topwear", "grey_shirt"], "stage3_selected_scores": {"blue_topwear": 
0.6595, "grey_shirt": 0.6862}, "stage3_selected_ranks": {"blue_topwear": 21, "grey_shirt": 16}, "stage3_selected_phrase_ranks": {"blue_topwear": 4, "grey_shirt": 4}, "extra_evidence": {"blue_clothing": {"source": "implied"}, "blue_topwear": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6595}, "grey_clothing": {"source": "implied"}, "grey_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6862}, "grey_topwear": {"source": "implied"}, "looking_at_viewer": {"source": "structural"}}, "structural": ["duo", "anthro", "clothed", "looking_at_viewer"], "probe": ["simple_background", "anthro", "duo"], "t1": 1.76, "t2": 1.7, "t3": 10.45, "t3s": 0.81, "t3p": 2.01, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=71 entity=2 copyright_filtered=0 generic_char_to_general=2 unknown_type=3"]}
11
+ {"id": 2034167, "n_gt": 11, "n_retrieved": 81, "n_selected": 26, "n_implied": 8, "n_structural": 4, "n_probe": 3, "ret_R": 0.5455, "P": 0.3077, "R": 0.7273, "F1": 0.4324, "leaf_P": 0.25, "leaf_R": 0.4286, "leaf_F1": 0.3158, "n_leaf_sel": 12, "n_leaf_gt": 7, "ret_P": 0.0741, "sel_given_ret": 1.3333, "over_sel": 2.36, "why": {"explicit": 13, "strong_implied": 1}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 2, "calls_with_selection": 2, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 2, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 49, "attempts_by_n_local": {"60": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}, "26": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3077, "gen_R": 0.7273, "gen_F1": 0.4324, "missed": ["blue_eyes", "blue_nose", "open_mouth"], "extra": ["animal_humanoid", "anthro", "blue_eyebrows", "blue_stripes", "canid_humanoid", "canine_humanoid", "clothed", "clothing", "eyebrows", "fox_humanoid", "humanoid", "jumper", "male", "mammal_humanoid", "pink_stripes", "pink_tail", "stripes", "tail"], "ground_truth_tags": ["blue_eyes", "blue_nose", "canid", "canine", "fur", "mammal", "open_mouth", "purple_body", "solo", "white_body", "white_fur"], "selected_tags": ["animal_humanoid", "anthro", "blue_eyebrows", "blue_stripes", "canid", "canid_humanoid", "canine", "canine_humanoid", "clothed", "clothing", "eyebrows", "fox_humanoid", "fur", "humanoid", "jumper", "male", "mammal", "mammal_humanoid", "pink_stripes", "pink_tail", "purple_body", "solo", "stripes", "tail", "white_body", "white_fur"], "stage3_selected": ["blue_eyebrows", "blue_stripes", "canid_humanoid", "canine_humanoid", "fox_humanoid", 
"fur", "jumper", "pink_stripes", "pink_tail", "purple_body", "simple_background", "stripes", "tail", "white_fur"], "stage3_selected_scores": {"fur": 0.5666, "simple_background": 0.5782, "tail": 0.5897, "white_fur": 0.5773, "stripes": 0.578, "purple_body": 0.5476, "canid_humanoid": 0.8744, "canine_humanoid": 0.9128, "pink_tail": 0.5166, "blue_stripes": 0.5367, "blue_eyebrows": 0.4546, "pink_stripes": 0.5444, "jumper": 0.4005, "fox_humanoid": 0.8327}, "stage3_selected_ranks": {"fur": 22, "simple_background": 19, "tail": 14, "white_fur": 21, "stripes": 20, "purple_body": 29, "canid_humanoid": 2, "canine_humanoid": 1, "pink_tail": 42, "blue_stripes": 35, "blue_eyebrows": 64, "pink_stripes": 34, "jumper": 74, "fox_humanoid": 4}, "stage3_selected_phrase_ranks": {"fur": 1, "simple_background": 1, "tail": 1, "white_fur": 1, "stripes": 1, "purple_body": 1, "canid_humanoid": 2, "canine_humanoid": 1, "pink_tail": 1, "blue_stripes": 2, "blue_eyebrows": 2, "pink_stripes": 1, "jumper": 2, "fox_humanoid": 4}, "extra_evidence": {"animal_humanoid": {"source": "implied"}, "anthro": {"source": "structural"}, "blue_eyebrows": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4546}, "blue_stripes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5367}, "canid_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8744}, "canine_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9128}, "clothed": {"source": "structural"}, "clothing": {"source": "implied"}, "eyebrows": {"source": "implied"}, "fox_humanoid": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.8327}, "humanoid": {"source": "implied"}, "jumper": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4005}, "male": {"source": "structural"}, "mammal_humanoid": {"source": "implied"}, "pink_stripes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5444}, "pink_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5166}, 
"stripes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.578}, "tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5897}}, "structural": ["solo", "anthro", "male", "clothed"], "probe": ["anthro", "canid", "solo"], "t1": 1.63, "t2": 1.69, "t3": 4.7, "t3s": 0.61, "t3p": 3.1, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=86 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=4"]}
data/eval_results/latency_single_shot_seed42.jsonl ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"_meta": true, "timestamp": "2026-03-02T05:57:34.402565", "n_samples": 10, "caption_field": "caption_cogvlm", "skip_rewrite": false, "allow_nsfw": false, "mode": "single_shot", "chunk_size": 60, "eval_path": "data/eval_samples/e621_sfw_sample_1000_seed123_buffer10000_caption_evident.jsonl", "per_phrase_k": 2, "per_phrase_final_k": 10, "temperature": 0.0, "shuffle": false, "seed": 42, "workers": 4, "min_why": "strong_implied", "expand_implications": true, "infer_structural": true, "infer_probe": true, "n_errors": 0, "n_issue_samples": 10, "n_issues_total": 43}
2
+ {"id": 3285630, "n_gt": 12, "n_retrieved": 153, "n_selected": 8, "n_implied": 1, "n_structural": 4, "n_probe": 5, "ret_R": 0.3333, "P": 0.875, "R": 0.5833, "F1": 0.7, "leaf_P": 0.6667, "leaf_R": 0.4444, "leaf_F1": 0.5333, "n_leaf_sel": 6, "n_leaf_gt": 9, "ret_P": 0.0261, "sel_given_ret": 1.75, "over_sel": 0.67, "why": {}, "stage3_diag": {"mode": "single_shot", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 0, "calls_exhausted_retries": 1, "attempts_total": 3, "attempt_errors": 3, "attempt_parse_fail": 0, "attempt_parse_ok": 0, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 0, "attempts_by_n_local": {"156": {"attempts": 3, "parse_ok": 0, "parse_fail": 0, "errors": 3}}, "attempt_failure_rate": 1.0, "call_exhaustion_rate": 1.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.875, "gen_R": 0.5833, "gen_F1": 0.7, "missed": ["alpha_channel", "feline", "fingers", "fur", "hair"], "extra": ["text"], "ground_truth_tags": ["alpha_channel", "anthro", "clothed", "clothing", "felid", "feline", "fingers", "fur", "hair", "male", "mammal", "solo"], "selected_tags": ["anthro", "clothed", "clothing", "felid", "male", "mammal", "solo", "text"], "stage3_selected": [], "stage3_selected_scores": {}, "stage3_selected_ranks": {}, "stage3_selected_phrase_ranks": {}, "extra_evidence": {"text": {"source": "probe"}}, "structural": ["solo", "anthro", "male", "clothed"], "probe": ["clothing", "anthro", "text", "felid", "solo"], "t1": 3.18, "t2": 1.54, "t3": 60.6, "t3s": 5.79, "t3p": 8.12, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=156 entity=1 copyright_filtered=1 generic_char_to_general=0 unknown_type=2", "Stage3 general_single_shot: attempt 1 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 2, \"why\": 
\"strong_implied\"}, {\"i\": 5, \"why\": \"explicit\"}, {\"i\": 6, \"why\": \"explicit\"}, {\"i\": 8, \"why\": \"explicit\"}, {\"i\": 9, \"why\": \"explicit\"}, {\"i\": 11, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"explicit\"}, {\"i\": 18, \"why\": \"explicit\"}, {\"i\": 21, \"why\": \"strong_implied\"}, {\"i\": 22, \"why\": \"strong_implied\"}, {\"i\": 25, \"why\": \"weak_implied\"}, {\"i\": 26, \"why\": \"other\"}, {\"i\": 28, \"why\": \"weak_implied\"}, {\"i\": 31, \"why\": \"weak_implied\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 35, \"why\": \"weak_implied\"}, {\"i\": 36, \"why\": \"style_or_meta\"}, {\"i\": 38, \"why\": \"explicit\"}, {\"i\": 40, \"why\": \"explicit\"}, {\"i\": 42, \"why\": \"weak_implied\"}, {\"i\": 45, \"why\": \"weak_implied\"}, {\"i\": 51, \"why\": \"weak_implied\"}, {\"i\": 52, \"why\": \"weak_implied\"}, {\"i\": 54, \"why\": \"weak_implied\"}, {\"i\": 56, \"why\": \"explicit\"}, {\"i\": 58, \"why\": \"weak_implied\"}, {\"i\": 60, \"why\": \"weak_implied\"}, {\"i\": 62, \"why\": \"style_or_meta\"}, {\"i\": 64, \"why\": \"weak_implied\"}, {\"i\": 67, \"why\": \"weak_implied\"}, {\"i\": 69, \"why\": \"weak_implied\"}, {\"i\": 70, \"why\": \"weak_implied\"}, {\"i\": 72, \"why\": \"explicit\"}, {\"i\": 74, \"why\": \"explicit\"}, {\"i\": 76, \"why\": \"weak_implied\"}, {\"i\": 78}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.35.why\n Field required [type=missing, input_value={'i': 78}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_single_shot: attempt 2 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 2, \"why\": \"strong_implied\"}, {\"i\": 5, \"why\": \"explicit\"}, {\"i\": 6, \"why\": \"explicit\"}, {\"i\": 8, \"why\": \"explicit\"}, {\"i\": 9, \"why\": \"explicit\"}, {\"i\": 11, \"why\": \"explicit\"}, {\"i\": 15, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"explicit\"}, {\"i\": 21, \"why\": \"strong_implied\"}, {\"i\": 22, \"why\": \"strong_implied\"}, {\"i\": 25, \"why\": \"weak_implied\"}, {\"i\": 26, \"why\": \"other\"}, {\"i\": 31, \"why\": \"weak_implied\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 35, \"why\": \"weak_implied\"}, {\"i\": 36, \"why\": \"style_or_meta\"}, {\"i\": 38, \"why\": \"explicit\"}, {\"i\": 39, \"why\": \"explicit\"}, {\"i\": 42, \"why\": \"other\"}, {\"i\": 45, \"why\": \"weak_implied\"}, {\"i\": 51, \"why\": \"weak_implied\"}, {\"i\": 53, \"why\": \"weak_implied\"}, {\"i\": 55, \"why\": \"style_or_meta\"}, {\"i\": 56, \"why\": \"explicit\"}, {\"i\": 58, \"why\": \"weak_implied\"}, {\"i\": 60, \"why\": \"weak_implied\"}, {\"i\": 65, \"why\": \"other\"}, {\"i\": 67, \"why\": \"weak_implied\"}, {\"i\": 69, \"why\": \"weak_implied\"}, {\"i\": 70, \"why\": \"weak_implied\"}, {\"i\": 72, \"why\": \"explicit\"}, {\"i\": 74, \"why\": \"explicit\"}, {\"i\": 76, \"why\": \"weak_implied\"}, {\"i\": 80, \"why\": \"style_or_meta\"}, {\"i\": 82, \"why\": \"weak_implied\"}, {\"i\": 84}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.35.why\n Field required [type=missing, input_value={'i': 84}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_single_shot: attempt 3 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 2, \"why\": \"explicit\"}, {\"i\": 5, \"why\": \"explicit\"}, {\"i\": 6, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"explicit\"}, {\"i\": 21, \"why\": \"explicit\"}, {\"i\": 22, \"why\": \"explicit\"}, {\"i\": 26, \"why\": \"other\"}, {\"i\": 31, \"why\": \"other\"}, {\"i\": 35, \"why\": \"explicit\"}, {\"i\": 36, \"why\": \"explicit\"}, {\"i\": 40, \"why\": \"explicit\"}, {\"i\": 41, \"why\": \"explicit\"}, {\"i\": 45, \"why\": \"weak_implied\"}, {\"i\": 51, \"why\": \"weak_implied\"}, {\"i\": 52, \"why\": \"weak_implied\"}, {\"i\": 55, \"why\": \"weak_implied\"}, {\"i\": 56, \"why\": \"explicit\"}, {\"i\": 57, \"why\": \"explicit\"}, {\"i\": 58, \"why\": \"explicit\"}, {\"i\": 59, \"why\": \"explicit\"}, {\"i\": 62, \"why\": \"explicit\"}, {\"i\": 65, \"why\": \"other\"}, {\"i\": 67, \"why\": \"other\"}, {\"i\": 69, \"why\": \"weak_implied\"}, {\"i\": 70, \"why\": \"other\"}, {\"i\": 72, \"why\": \"explicit\"}, {\"i\": 73, \"why\": \"explicit\"}, {\"i\": 74, \"why\": \"explicit\"}, {\"i\": 76, \"why\": \"explicit\"}, {\"i\": 77, \"why\": \"explicit\"}, {\"i\": 80, \"why\": \"weak_implied\"}, {\"i\": 82, \"why\": \"weak_implied\"}, {\"i\": 84, \"why\": \"other\"}, {\"i\": 87, \"why\": \"weak_implied\"}, {\"i\": 88, \"why\": \"other\"}, {\"i\": 90, \"why\": \"explicit\"}, {\"i\": 91, \"why\": \"weak_implied\"}, {\"i\": 93}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.37.why\n Field required [type=missing, input_value={'i': 93}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_single_shot: gave up after 3 attempts"]}
3
+ {"id": 260449, "n_gt": 14, "n_retrieved": 160, "n_selected": 11, "n_implied": 1, "n_structural": 6, "n_probe": 6, "ret_R": 0.5714, "P": 0.5455, "R": 0.4286, "F1": 0.48, "leaf_P": 0.2857, "leaf_R": 0.2, "leaf_F1": 0.2353, "n_leaf_sel": 7, "n_leaf_gt": 10, "ret_P": 0.05, "sel_given_ret": 0.75, "over_sel": 0.79, "why": {}, "stage3_diag": {"mode": "single_shot", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 0, "calls_exhausted_retries": 1, "attempts_total": 3, "attempt_errors": 2, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 0, "attempts_by_n_local": {"161": {"attempts": 3, "parse_ok": 1, "parse_fail": 0, "errors": 2}}, "attempt_failure_rate": 0.6666666666666666, "call_exhaustion_rate": 1.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.5455, "gen_R": 0.4286, "gen_F1": 0.48, "missed": ["ape", "dancing", "fur", "hair", "haplorhine", "human", "male", "primate"], "extra": ["anthro", "duo", "humanoid", "topless", "trio"], "ground_truth_tags": ["ape", "bear", "clothed", "clothing", "dancing", "fur", "group", "hair", "haplorhine", "human", "looking_at_viewer", "male", "mammal", "primate"], "selected_tags": ["anthro", "bear", "clothed", "clothing", "duo", "group", "humanoid", "looking_at_viewer", "mammal", "topless", "trio"], "stage3_selected": [], "stage3_selected_scores": {}, "stage3_selected_ranks": {}, "stage3_selected_phrase_ranks": {}, "extra_evidence": {"anthro": {"source": "structural"}, "duo": {"source": "probe"}, "humanoid": {"source": "structural"}, "topless": {"source": "structural"}, "trio": {"source": "structural"}}, "structural": ["trio", "anthro", "humanoid", "clothed", "topless", "looking_at_viewer"], "probe": ["clothing", "simple_background", "anthro", "duo", "group", "bear"], "t1": 3.09, "t2": 1.9, "t3": 11.5, "t3s": 3.64, "t3p": 8.74, "err": null, "issues": ["Stage3 
split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=161 entity=5 copyright_filtered=2 generic_char_to_general=1 unknown_type=3", "Stage3 general_single_shot: attempt 2 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"explicit\"}, {\"i\": 3, \"why\": \"explicit\"}, {\"i\": 5, \"why\": \"explicit\"}, {\"i\": 9, \"why\": \"explicit\"}, {\"i\": 15, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"explicit\"}, {\"i\": 17, \"why\": \"weak_implied\"}, {\"i\": 25, \"why\": \"explicit\"}, {\"i\": 31, \"why\": \"weak_implied\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 35, \"why\": \"weak_implied\"}, {\"i\": 41, \"why\": \"weak_implied\"}, {\"i\": 43, \"why\": \"weak_implied\"}, {\"i\": 45, \"why\": \"weak_implied\"}, {\"i\": 47, \"why\": \"weak_implied\"}, {\"i\": 53, \"why\": \"weak_implied\"}, {\"i\": 55, \"why\": \"style_or_meta\"}, {\"i\": 57, \"why\": \"weak_implied\"}, {\"i\": 61, \"why\": \"weak_implied\"}, {\"i\": 63, \"why\": \"weak_implied\"}, {\"i\": 65, \"why\": \"style_or_meta\"}, {\"i\": 67, \"why\": \"weak_implied\"}, {\"i\": 69, \"why\": \"explicit\"}, {\"i\": 71, \"why\": \"style_or_meta\"}, {\"i\": 73, \"why\": \"weak_implied\"}, {\"i\": 75, \"why\": \"weak_implied\"}, {\"i\": 77, \"why\": \"weak_implied\"}, {\"i\": 81, \"why\": \"weak_implied\"}, {\"i\": 83, \"why\": \"style_or_meta\"}, {\"i\": 85, \"why\": \"weak_implied\"}, {\"i\": 87, \"why\": \"weak_implied\"}, {\"i\": 89, \"why\": \"style_or_meta\"}, {\"i\": 91, \"why\": \"weak_implied\"}, {\"i\": 93, \"why\": \"weak_implied\"}, {\"i\": 95}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.34.why\n Field required [type=missing, input_value={'i': 95}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_single_shot: attempt 3 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 3, \"why\": \"explicit\"}, {\"i\": 5, \"why\": \"explicit\"}, {\"i\": 9, \"why\": \"explicit\"}, {\"i\": 15, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"explicit\"}, {\"i\": 25, \"why\": \"explicit\"}, {\"i\": 4, \"why\": \"style_or_meta\"}, {\"i\": 6, \"why\": \"weak_implied\"}, {\"i\": 10, \"why\": \"weak_implied\"}, {\"i\": 11, \"why\": \"weak_implied\"}, {\"i\": 14, \"why\": \"weak_implied\"}, {\"i\": 17, \"why\": \"weak_implied\"}, {\"i\": 21, \"why\": \"style_or_meta\"}, {\"i\": 23, \"why\": \"weak_implied\"}, {\"i\": 24, \"why\": \"weak_implied\"}, {\"i\": 30, \"why\": \"other\"}, {\"i\": 31, \"why\": \"other\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 35, \"why\": \"weak_implied\"}, {\"i\": 37, \"why\": \"weak_implied\"}, {\"i\": 39, \"why\": \"weak_implied\"}, {\"i\": 41, \"why\": \"weak_implied\"}, {\"i\": 43, \"why\": \"weak_implied\"}, {\"i\": 45, \"why\": \"weak_implied\"}, {\"i\": 47, \"why\": \"weak_implied\"}, {\"i\": 51, \"why\": \"style_or_meta\"}, {\"i\": 53, \"why\": \"weak_implied\"}, {\"i\": 55, \"why\": \"weak_implied\"}, {\"i\": 57, \"why\": \"weak_implied\"}, {\"i\": 59, \"why\": \"weak_implied\"}, {\"i\": 61, \"why\": \"weak_implied\"}, {\"i\": 63, \"why\": \"weak_implied\"}, {\"i\": 65, \"why\": \"weak_implied\"}, {\"i\": 67, \"why\": \"weak_implied\"}, {\"i\": 69}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.34.why\n Field required [type=missing, input_value={'i': 69}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_single_shot: gave up after 3 attempts"]}
4
+ {"id": 1078019, "n_gt": 14, "n_retrieved": 123, "n_selected": 27, "n_implied": 9, "n_structural": 5, "n_probe": 4, "ret_R": 0.7143, "P": 0.5185, "R": 1.0, "F1": 0.6829, "leaf_P": 0.5, "leaf_R": 0.8889, "leaf_F1": 0.64, "n_leaf_sel": 16, "n_leaf_gt": 9, "ret_P": 0.0813, "sel_given_ret": 1.4, "over_sel": 1.93, "why": {"explicit": 9, "strong_implied": 2}, "stage3_diag": {"mode": "single_shot", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 1, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 35, "attempts_by_n_local": {"121": {"attempts": 2, "parse_ok": 1, "parse_fail": 0, "errors": 1}}, "attempt_failure_rate": 0.5, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.5185, "gen_R": 1.0, "gen_F1": 0.6829, "missed": [], "extra": ["<3", "coat", "holding_object", "holding_plushie", "looking_at_viewer", "love", "male", "raincoat", "topwear", "vest", "white_clothing", "white_coat", "white_topwear"], "ground_truth_tags": ["anthro", "blue_eyes", "blush", "clothed", "clothing", "duo", "lagomorph", "leporid", "mammal", "plushie", "rabbit", "romantic", "romantic_couple", "teal_eyes"], "selected_tags": ["<3", "anthro", "blue_eyes", "blush", "clothed", "clothing", "coat", "duo", "holding_object", "holding_plushie", "lagomorph", "leporid", "looking_at_viewer", "love", "male", "mammal", "plushie", "rabbit", "raincoat", "romantic", "romantic_couple", "teal_eyes", "topwear", "vest", "white_clothing", "white_coat", "white_topwear"], "stage3_selected": ["blue_eyes", "coat", "holding_plushie", "love", "plushie", "rabbit", "raincoat", "romantic_couple", "teal_eyes", "vest", "white_coat"], "stage3_selected_scores": {"blue_eyes": 0.4246, "rabbit": 0.5842, "coat": 0.6315, "plushie": 0.6566, "vest": 0.4922, 
"teal_eyes": 0.6344, "holding_plushie": 0.5459, "raincoat": 0.5029, "white_coat": 0.5129, "romantic_couple": 0.5616, "love": 0.4648}, "stage3_selected_ranks": {"blue_eyes": 86, "rabbit": 9, "coat": 5, "plushie": 3, "vest": 47, "teal_eyes": 4, "holding_plushie": 17, "raincoat": 32, "white_coat": 27, "romantic_couple": 13, "love": 56}, "stage3_selected_phrase_ranks": {"blue_eyes": 6, "rabbit": 1, "coat": 1, "plushie": 1, "vest": 9, "teal_eyes": 1, "holding_plushie": 2, "raincoat": 5, "white_coat": 4, "romantic_couple": 1, "love": 5}, "extra_evidence": {"<3": {"source": "probe"}, "coat": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6315}, "holding_object": {"source": "implied"}, "holding_plushie": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5459}, "looking_at_viewer": {"source": "structural"}, "love": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4648}, "male": {"source": "structural"}, "raincoat": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5029}, "topwear": {"source": "implied"}, "vest": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4922}, "white_clothing": {"source": "implied"}, "white_coat": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5129}, "white_topwear": {"source": "implied"}}, "structural": ["duo", "anthro", "male", "clothed", "looking_at_viewer"], "probe": ["anthro", "blush", "duo", "<3"], "t1": 2.97, "t2": 3.17, "t3": 56.28, "t3s": 4.35, "t3p": 3.9, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=121 entity=3 copyright_filtered=0 generic_char_to_general=0 unknown_type=2", "Stage3 general_single_shot: attempt 1 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"explicit\"}, {\"i\": 2, \"why\": \"weak_implied\"}, {\"i\": 5, \"why\": \"explicit\"}, {\"i\": 7, \"why\": \"weak_implied\"}, {\"i\": 8, \"why\": 
\"weak_implied\"}, {\"i\": 9, \"why\": \"weak_implied\"}, {\"i\": 10, \"why\": \"explicit\"}, {\"i\": 12, \"why\": \"weak_implied\"}, {\"i\": 14, \"why\": \"weak_implied\"}, {\"i\": 17, \"why\": \"explicit\"}, {\"i\": 20, \"why\": \"weak_implied\"}, {\"i\": 21, \"why\": \"weak_implied\"}, {\"i\": 25, \"why\": \"weak_implied\"}, {\"i\": 30, \"why\": \"weak_implied\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 37, \"why\": \"weak_implied\"}, {\"i\": 38, \"why\": \"style_or_meta\"}, {\"i\": 40, \"why\": \"weak_implied\"}, {\"i\": 45, \"why\": \"weak_implied\"}, {\"i\": 46, \"why\": \"weak_implied\"}, {\"i\": 50, \"why\": \"weak_implied\"}, {\"i\": 53, \"why\": \"weak_implied\"}, {\"i\": 54, \"why\": \"weak_implied\"}, {\"i\": 55, \"why\": \"weak_implied\"}, {\"i\": 57, \"why\": \"weak_implied\"}, {\"i\": 60, \"why\": \"weak_implied\"}, {\"i\": 62, \"why\": \"weak_implied\"}, {\"i\": 65, \"why\": \"weak_implied\"}, {\"i\": 67, \"why\": \"weak_implied\"}, {\"i\": 69, \"why\": \"weak_implied\"}, {\"i\": 71, \"why\": \"weak_implied\"}, {\"i\": 73, \"why\": \"weak_implied\"}, {\"i\": 76, \"why\": \"weak_implied\"}, {\"i\": 77, \"why\": \"weak_implied\"}, {}]}. Got: 2 validation errors for Stage3SelectionResponse\nselections.34.i\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nselections.34.why\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE "]}
5
+ {"id": 1624724, "n_gt": 4, "n_retrieved": 66, "n_selected": 9, "n_implied": 0, "n_structural": 5, "n_probe": 3, "ret_R": 0.0, "P": 0.1111, "R": 0.25, "F1": 0.1538, "leaf_P": 0.1111, "leaf_R": 0.25, "leaf_F1": 0.1538, "n_leaf_sel": 9, "n_leaf_gt": 4, "ret_P": 0.0, "sel_given_ret": 0.0, "over_sel": 2.25, "why": {"explicit": 5}, "stage3_diag": {"mode": "single_shot", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 14, "attempts_by_n_local": {"62": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.1111, "gen_R": 0.25, "gen_F1": 0.1538, "missed": ["red_nose", "smile", "tan_body"], "extra": ["ambiguous_gender", "anthro", "feral", "looking_at_viewer", "nude", "sky_background", "toony", "wide_eyed"], "ground_truth_tags": ["red_nose", "smile", "solo", "tan_body"], "selected_tags": ["ambiguous_gender", "anthro", "feral", "looking_at_viewer", "nude", "sky_background", "solo", "toony", "wide_eyed"], "stage3_selected": ["simple_background", "sky_background", "toony", "white_background", "wide_eyed"], "stage3_selected_scores": {"simple_background": 0.5582, "white_background": 0.5301, "toony": 0.5337, "wide_eyed": 0.4535, "sky_background": 0.5476}, "stage3_selected_ranks": {"simple_background": 5, "white_background": 11, "toony": 9, "wide_eyed": 29, "sky_background": 6}, "stage3_selected_phrase_ranks": {"simple_background": 1, "white_background": 8, "toony": 2, "wide_eyed": 1, "sky_background": 4}, "extra_evidence": {"ambiguous_gender": {"source": "structural"}, "anthro": {"source": "probe"}, "feral": {"source": "structural"}, "looking_at_viewer": {"source": 
"structural"}, "nude": {"source": "structural"}, "sky_background": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5476}, "toony": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5337}, "wide_eyed": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4535}}, "structural": ["solo", "feral", "ambiguous_gender", "nude", "looking_at_viewer"], "probe": ["anthro", "simple_background", "solo"], "t1": 31.72, "t2": 0.61, "t3": 1.85, "t3s": 5.36, "t3p": 4.34, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "LLM rewrite: fallback (error: ReadTimeout: The read operation timed out)", "Stage3 split: general=62 entity=1 copyright_filtered=9 generic_char_to_general=0 unknown_type=1"]}
6
+ {"id": 1325009, "n_gt": 22, "n_retrieved": 192, "n_selected": 8, "n_implied": 2, "n_structural": 5, "n_probe": 3, "ret_R": 0.4091, "P": 0.875, "R": 0.3182, "F1": 0.4667, "leaf_P": 0.1667, "leaf_R": 0.0833, "leaf_F1": 0.1111, "n_leaf_sel": 6, "n_leaf_gt": 12, "ret_P": 0.0469, "sel_given_ret": 0.7778, "over_sel": 0.36, "why": {}, "stage3_diag": {"mode": "single_shot", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 0, "calls_exhausted_retries": 1, "attempts_total": 3, "attempt_errors": 3, "attempt_parse_fail": 0, "attempt_parse_ok": 0, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 0, "attempts_by_n_local": {"193": {"attempts": 3, "parse_ok": 0, "parse_fail": 0, "errors": 3}}, "attempt_failure_rate": 1.0, "call_exhaustion_rate": 1.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.875, "gen_R": 0.3182, "gen_F1": 0.4667, "missed": ["blue_eyes", "bottomwear", "chest_tuft", "countershading", "fur", "hand_on_head", "muscular", "muscular_anthro", "muscular_male", "pantherine", "shorts", "stripes", "tiger", "topless", "tuft"], "extra": ["looking_at_viewer"], "ground_truth_tags": ["anthro", "blue_eyes", "bottomwear", "chest_tuft", "clothed", "clothing", "countershading", "felid", "fur", "hand_on_head", "male", "mammal", "muscular", "muscular_anthro", "muscular_male", "pantherine", "shorts", "solo", "stripes", "tiger", "topless", "tuft"], "selected_tags": ["anthro", "clothed", "clothing", "felid", "looking_at_viewer", "male", "mammal", "solo"], "stage3_selected": [], "stage3_selected_scores": {}, "stage3_selected_ranks": {}, "stage3_selected_phrase_ranks": {}, "extra_evidence": {"looking_at_viewer": {"source": "structural"}}, "structural": ["solo", "anthro", "male", "clothed", "looking_at_viewer"], "probe": ["anthro", "felid", "solo"], "t1": 1.93, "t2": 1.89, "t3": 38.4, "t3s": 0.87, "t3p": 1.34, "err": null, "issues": ["Stage3 split: 
general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=193 entity=1 copyright_filtered=2 generic_char_to_general=0 unknown_type=3", "Stage3 general_single_shot: attempt 1 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"strong_implied\"}, {\"i\": 4, \"why\": \"explicit\"}, {\"i\": 11, \"why\": \"explicit\"}, {\"i\": 14, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"style_or_meta\"}, {\"i\": 21, \"why\": \"explicit\"}, {\"i\": 24, \"why\": \"weak_implied\"}, {\"i\": 27, \"why\": \"style_or_meta\"}, {\"i\": 28, \"why\": \"explicit\"}, {\"i\": 31, \"why\": \"style_or_meta\"}, {\"i\": 35, \"why\": \"other\"}, {\"i\": 41, \"why\": \"explicit\"}, {\"i\": 43, \"why\": \"other\"}, {\"i\": 46, \"why\": \"explicit\"}, {\"i\": 51, \"why\": \"style_or_meta\"}, {\"i\": 55, \"why\": \"style_or_meta\"}, {\"i\": 57, \"why\": \"other\"}, {\"i\": 61, \"why\": \"weak_implied\"}, {\"i\": 65, \"why\": \"style_or_meta\"}, {\"i\": 68, \"why\": \"other\"}, {\"i\": 71, \"why\": \"weak_implied\"}, {\"i\": 74, \"why\": \"style_or_meta\"}, {\"i\": 77, \"why\": \"weak_implied\"}, {\"i\": 81, \"why\": \"other\"}, {\"i\": 84, \"why\": \"weak_implied\"}, {\"i\": 87, \"why\": \"strong_implied\"}, {\"i\": 91, \"why\": \"explicit\"}, {\"i\": 94, \"why\": \"weak_implied\"}, {\"i\": 97, \"why\": \"style_or_meta\"}, {\"i\": 100, \"why\": \"weak_implied\"}, {\"i\": 103, \"why\": \"other\"}, {\"i\": 106, \"why\": \"weak_implied\"}, {\"i\": 109, \"why\": \"strong_implied\"}, {\"i\": 113, \"why\": \"other\"}, {\"i\": 116, \"why\": \"weak_implied\"}, {\"i\": 119}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.35.why\n Field required [type=missing, input_value={'i': 119}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_single_shot: attempt 2 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"explicit\"}, {\"i\": 4, \"why\": \"explicit\"}, {\"i\": 11, \"why\": \"explicit\"}, {\"i\": 14, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"explicit\"}, {\"i\": 21, \"why\": \"explicit\"}, {\"i\": 25, \"why\": \"explicit\"}, {\"i\": 27, \"why\": \"explicit\"}, {\"i\": 28, \"why\": \"explicit\"}, {\"i\": 31, \"why\": \"explicit\"}, {\"i\": 34, \"why\": \"explicit\"}, {\"i\": 37, \"why\": \"explicit\"}, {\"i\": 39, \"why\": \"explicit\"}, {\"i\": 41, \"why\": \"explicit\"}, {\"i\": 43, \"why\": \"explicit\"}, {\"i\": 45, \"why\": \"explicit\"}, {\"i\": 47, \"why\": \"explicit\"}, {\"i\": 51, \"why\": \"explicit\"}, {\"i\": 53, \"why\": \"explicit\"}, {\"i\": 55, \"why\": \"explicit\"}, {\"i\": 57, \"why\": \"explicit\"}, {\"i\": 59, \"why\": \"explicit\"}, {\"i\": 61, \"why\": \"explicit\"}, {\"i\": 63, \"why\": \"explicit\"}, {\"i\": 65, \"why\": \"explicit\"}, {\"i\": 67, \"why\": \"explicit\"}, {\"i\": 69, \"why\": \"explicit\"}, {\"i\": 71, \"why\": \"explicit\"}, {\"i\": 73, \"why\": \"explicit\"}, {\"i\": 75, \"why\": \"explicit\"}, {\"i\": 77, \"why\": \"explicit\"}, {\"i\": 79, \"why\": \"explicit\"}, {\"i\": 81, \"why\": \"explicit\"}, {\"i\": 83, \"why\": \"explicit\"}, {\"i\": 85, \"why\": \"explicit\"}, {\"i\": 87, \"why\": \"explicit\"}, {\"i\": 89, \"why\": \"explicit\"}, {\"i\": 91, \"why\": \"explicit\"}, {\"i\": 93}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.38.why\n Field required [type=missing, input_value={'i': 93}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_single_shot: attempt 3 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"explicit\"}, {\"i\": 4, \"why\": \"explicit\"}, {\"i\": 14, \"why\": \"explicit\"}, {\"i\": 21, \"why\": \"explicit\"}, {\"i\": 28, \"why\": \"explicit\"}, {\"i\": 31, \"why\": \"explicit\"}, {\"i\": 41, \"why\": \"explicit\"}, {\"i\": 68, \"why\": \"weak_implied\"}, {\"i\": 71, \"why\": \"explicit\"}, {\"i\": 76, \"why\": \"weak_implied\"}, {\"i\": 87, \"why\": \"strong_implied\"}, {\"i\": 91, \"why\": \"weak_implied\"}, {\"i\": 95, \"why\": \"weak_implied\"}, {\"i\": 97, \"why\": \"weak_implied\"}, {\"i\": 109, \"why\": \"strong_implied\"}, {\"i\": 113, \"why\": \"other\"}, {\"i\": 115, \"why\": \"weak_implied\"}, {\"i\": 119, \"why\": \"style_or_meta\"}, {\"i\": 128, \"why\": \"weak_implied\"}, {\"i\": 131, \"why\": \"weak_implied\"}, {\"i\": 137, \"why\": \"weak_implied\"}, {\"i\": 141, \"why\": \"weak_implied\"}, {\"i\": 145, \"why\": \"weak_implied\"}, {\"i\": 149, \"why\": \"weak_implied\"}, {\"i\": 153, \"why\": \"style_or_meta\"}, {\"i\": 157, \"why\": \"weak_implied\"}, {\"i\": 161, \"why\": \"weak_implied\"}, {\"i\": 165, \"why\": \"weak_implied\"}, {\"i\": 169, \"why\": \"weak_implied\"}, {\"i\": 173, \"why\": \"weak_implied\"}, {\"i\": 177, \"why\": \"weak_implied\"}, {\"i\": 181, \"why\": \"weak_implied\"}, {\"i\": 185, \"why\": \"weak_implied\"}, {\"i\": 189, \"why\": \"weak_implied\"}, {\"i\": 193}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.34.why\n Field required [type=missing, input_value={'i': 193}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_single_shot: gave up after 3 attempts"]}
7
+ {"id": 1023509, "n_gt": 13, "n_retrieved": 177, "n_selected": 7, "n_implied": 0, "n_structural": 5, "n_probe": 4, "ret_R": 0.6923, "P": 0.1429, "R": 0.0769, "F1": 0.1, "leaf_P": 0.1667, "leaf_R": 0.1667, "leaf_F1": 0.1667, "n_leaf_sel": 6, "n_leaf_gt": 6, "ret_P": 0.0508, "sel_given_ret": 0.1111, "over_sel": 0.54, "why": {}, "stage3_diag": {"mode": "single_shot", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 0, "calls_exhausted_retries": 1, "attempts_total": 3, "attempt_errors": 3, "attempt_parse_fail": 0, "attempt_parse_ok": 0, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 0, "attempts_by_n_local": {"168": {"attempts": 3, "parse_ok": 0, "parse_fail": 0, "errors": 3}}, "attempt_failure_rate": 1.0, "call_exhaustion_rate": 1.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.1429, "gen_R": 0.0769, "gen_F1": 0.1, "missed": ["bovid", "caprine", "dialogue", "fur", "goat", "human", "lizard", "mammal", "reptile", "scalie", "white_body", "white_fur"], "extra": ["anthro", "clothed", "clothing", "group", "intersex", "taur"], "ground_truth_tags": ["bovid", "caprine", "dialogue", "fur", "goat", "human", "lizard", "mammal", "reptile", "scalie", "text", "white_body", "white_fur"], "selected_tags": ["anthro", "clothed", "clothing", "group", "intersex", "taur", "text"], "stage3_selected": [], "stage3_selected_scores": {}, "stage3_selected_ranks": {}, "stage3_selected_phrase_ranks": {}, "extra_evidence": {"anthro": {"source": "probe"}, "clothed": {"source": "structural"}, "clothing": {"source": "probe"}, "group": {"source": "structural"}, "intersex": {"source": "structural"}, "taur": {"source": "structural"}}, "structural": ["group", "taur", "intersex", "clothed", "text"], "probe": ["clothing", "simple_background", "anthro", "text"], "t1": 2.38, "t2": 1.56, "t3": 37.47, "t3s": 2.28, "t3p": 1.41, "err": null, "issues": ["Stage3 split: 
general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=168 entity=3 copyright_filtered=6 generic_char_to_general=0 unknown_type=2", "Stage3 general_single_shot: attempt 1 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"weak_implied\"}, {\"i\": 2, \"why\": \"weak_implied\"}, {\"i\": 3, \"why\": \"weak_implied\"}, {\"i\": 4, \"why\": \"weak_implied\"}, {\"i\": 5, \"why\": \"explicit\"}, {\"i\": 6, \"why\": \"weak_implied\"}, {\"i\": 7, \"why\": \"explicit\"}, {\"i\": 8, \"why\": \"weak_implied\"}, {\"i\": 9, \"why\": \"weak_implied\"}, {\"i\": 10, \"why\": \"weak_implied\"}, {\"i\": 11, \"why\": \"explicit\"}, {\"i\": 12, \"why\": \"explicit\"}, {\"i\": 13, \"why\": \"explicit\"}, {\"i\": 14, \"why\": \"weak_implied\"}, {\"i\": 15, \"why\": \"weak_implied\"}, {\"i\": 16, \"why\": \"weak_implied\"}, {\"i\": 17, \"why\": \"weak_implied\"}, {\"i\": 18, \"why\": \"weak_implied\"}, {\"i\": 19, \"why\": \"weak_implied\"}, {\"i\": 20, \"why\": \"weak_implied\"}, {\"i\": 21, \"why\": \"weak_implied\"}, {\"i\": 22, \"why\": \"weak_implied\"}, {\"i\": 23, \"why\": \"weak_implied\"}, {\"i\": 24, \"why\": \"weak_implied\"}, {\"i\": 25, \"why\": \"weak_implied\"}, {\"i\": 26, \"why\": \"weak_implied\"}, {\"i\": 27, \"why\": \"weak_implied\"}, {\"i\": 28, \"why\": \"weak_implied\"}, {\"i\": 29, \"why\": \"weak_implied\"}, {\"i\": 30, \"why\": \"explicit\"}, {\"i\": 31, \"why\": \"weak_implied\"}, {\"i\": 32, \"why\": \"weak_implied\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 34, \"why\": \"weak_implied\"}, {\"i\": 35}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.34.why\n Field required [type=missing, input_value={'i': 35}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_single_shot: attempt 2 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"weak_implied\"}, {\"i\": 2, \"why\": \"weak_implied\"}, {\"i\": 3, \"why\": \"explicit\"}, {\"i\": 4, \"why\": \"weak_implied\"}, {\"i\": 5, \"why\": \"explicit\"}, {\"i\": 6, \"why\": \"weak_implied\"}, {\"i\": 7, \"why\": \"explicit\"}, {\"i\": 8, \"why\": \"weak_implied\"}, {\"i\": 9, \"why\": \"weak_implied\"}, {\"i\": 10, \"why\": \"weak_implied\"}, {\"i\": 11, \"why\": \"explicit\"}, {\"i\": 12, \"why\": \"explicit\"}, {\"i\": 13, \"why\": \"explicit\"}, {\"i\": 14, \"why\": \"weak_implied\"}, {\"i\": 15, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"weak_implied\"}, {\"i\": 17, \"why\": \"weak_implied\"}, {\"i\": 18, \"why\": \"weak_implied\"}, {\"i\": 19, \"why\": \"weak_implied\"}, {\"i\": 20, \"why\": \"weak_implied\"}, {\"i\": 21, \"why\": \"weak_implied\"}, {\"i\": 22, \"why\": \"weak_implied\"}, {\"i\": 23, \"why\": \"weak_implied\"}, {\"i\": 24, \"why\": \"weak_implied\"}, {\"i\": 25, \"why\": \"weak_implied\"}, {\"i\": 26, \"why\": \"weak_implied\"}, {\"i\": 27, \"why\": \"weak_implied\"}, {\"i\": 28, \"why\": \"weak_implied\"}, {\"i\": 29, \"why\": \"weak_implied\"}, {\"i\": 30, \"why\": \"explicit\"}, {\"i\": 31, \"why\": \"weak_implied\"}, {\"i\": 32, \"why\": \"weak_implied\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 34, \"why\": \"weak_implied\"}, {\"i\": 35}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.34.why\n Field required [type=missing, input_value={'i': 35}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_single_shot: attempt 3 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"weak_implied\"}, {\"i\": 2, \"why\": \"weak_implied\"}, {\"i\": 3, \"why\": \"explicit\"}, {\"i\": 4, \"why\": \"weak_implied\"}, {\"i\": 5, \"why\": \"explicit\"}, {\"i\": 6, \"why\": \"weak_implied\"}, {\"i\": 7, \"why\": \"explicit\"}, {\"i\": 8, \"why\": \"explicit\"}, {\"i\": 9, \"why\": \"weak_implied\"}, {\"i\": 10, \"why\": \"weak_implied\"}, {\"i\": 11, \"why\": \"explicit\"}, {\"i\": 12, \"why\": \"explicit\"}, {\"i\": 13, \"why\": \"explicit\"}, {\"i\": 14, \"why\": \"explicit\"}, {\"i\": 15, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"weak_implied\"}, {\"i\": 17, \"why\": \"weak_implied\"}, {\"i\": 18, \"why\": \"explicit\"}, {\"i\": 19, \"why\": \"weak_implied\"}, {\"i\": 20, \"why\": \"weak_implied\"}, {\"i\": 21, \"why\": \"weak_implied\"}, {\"i\": 22, \"why\": \"weak_implied\"}, {\"i\": 23, \"why\": \"weak_implied\"}, {\"i\": 24, \"why\": \"explicit\"}, {\"i\": 25, \"why\": \"weak_implied\"}, {\"i\": 26, \"why\": \"weak_implied\"}, {\"i\": 27, \"why\": \"weak_implied\"}, {\"i\": 28, \"why\": \"weak_implied\"}, {\"i\": 29, \"why\": \"weak_implied\"}, {\"i\": 30, \"why\": \"explicit\"}, {\"i\": 31, \"why\": \"weak_implied\"}, {\"i\": 32, \"why\": \"weak_implied\"}, {\"i\": 33, \"why\": \"weak_implied\"}, {\"i\": 34, \"why\": \"weak_implied\"}, {\"i\": 35, \"why\": \"explicit\"}, {\"i\": 36}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.35.why\n Field required [type=missing, input_value={'i': 36}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_single_shot: gave up after 3 attempts"]}
8
+ {"id": 335343, "n_gt": 15, "n_retrieved": 208, "n_selected": 18, "n_implied": 1, "n_structural": 3, "n_probe": 3, "ret_R": 0.6667, "P": 0.4444, "R": 0.5333, "F1": 0.4848, "leaf_P": 0.4, "leaf_R": 0.5, "leaf_F1": 0.4444, "n_leaf_sel": 15, "n_leaf_gt": 12, "ret_P": 0.0481, "sel_given_ret": 0.8, "over_sel": 1.2, "why": {"explicit": 14}, "stage3_diag": {"mode": "single_shot", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 2, "attempt_errors": 1, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 36, "attempts_by_n_local": {"211": {"attempts": 2, "parse_ok": 1, "parse_fail": 0, "errors": 1}}, "attempt_failure_rate": 0.5, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.4444, "gen_R": 0.5333, "gen_F1": 0.4848, "missed": ["angry", "bed", "eyes_closed", "furniture", "green_eyes", "lying", "sleeping"], "extra": ["annoyed", "annoyed_expression", "anthro", "bed_covers", "bedding", "bedroom", "blanket", "expressions", "eyes", "humanoid"], "ground_truth_tags": ["angry", "bed", "blonde_hair", "blue_eyes", "duo", "eyes_closed", "eyeshadow", "furniture", "green_eyes", "hair", "lying", "makeup", "purple_hair", "sleeping", "text"], "selected_tags": ["annoyed", "annoyed_expression", "anthro", "bed_covers", "bedding", "bedroom", "blanket", "blonde_hair", "blue_eyes", "duo", "expressions", "eyes", "eyeshadow", "hair", "humanoid", "makeup", "purple_hair", "text"], "stage3_selected": ["annoyed", "annoyed_expression", "bed_covers", "bedroom", "blanket", "blonde_hair", "blue_eyes", "expressions", "eyes", "eyeshadow", "hair", "makeup", "purple_hair", "text"], "stage3_selected_scores": {"hair": 0.6031, "text": 0.6007, "blue_eyes": 0.6014, "blonde_hair": 0.5986, "purple_hair": 0.5642, "makeup": 0.5965, "eyeshadow": 0.4763, "bedroom": 
0.4901, "annoyed": 0.5727, "blanket": 0.4205, "annoyed_expression": 0.7251, "bed_covers": 0.4145, "expressions": 0.5439, "eyes": 0.8951}, "stage3_selected_ranks": {"hair": 5, "text": 8, "blue_eyes": 7, "blonde_hair": 10, "purple_hair": 14, "makeup": 11, "eyeshadow": 53, "bedroom": 43, "annoyed": 13, "blanket": 98, "annoyed_expression": 2, "bed_covers": 108, "expressions": 18, "eyes": 1}, "stage3_selected_phrase_ranks": {"hair": 1, "text": 1, "blue_eyes": 1, "blonde_hair": 1, "purple_hair": 1, "makeup": 1, "eyeshadow": 3, "bedroom": 1, "annoyed": 2, "blanket": 7, "annoyed_expression": 1, "bed_covers": 3, "expressions": 3, "eyes": 1}, "extra_evidence": {"annoyed": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5727}, "annoyed_expression": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7251}, "anthro": {"source": "probe"}, "bed_covers": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4145}, "bedding": {"source": "implied"}, "bedroom": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4901}, "blanket": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4205}, "expressions": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5439}, "eyes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8951}, "humanoid": {"source": "structural"}}, "structural": ["duo", "humanoid", "text"], "probe": ["simple_background", "anthro", "duo"], "t1": 2.3, "t2": 1.92, "t3": 28.93, "t3s": 0.87, "t3p": 7.59, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=211 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=4", "Stage3 general_single_shot: attempt 1 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 2, \"why\": \"explicit\"}, {\"i\": 3, \"why\": \"explicit\"}, {\"i\": 4, \"why\": \"explicit\"}, {\"i\": 6, \"why\": \"explicit\"}, {\"i\": 8, \"why\": 
\"explicit\"}, {\"i\": 10, \"why\": \"weak_implied\"}, {\"i\": 13, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"explicit\"}, {\"i\": 18, \"why\": \"style_or_meta\"}, {\"i\": 20, \"why\": \"style_or_meta\"}, {\"i\": 24, \"why\": \"explicit\"}, {\"i\": 26, \"why\": \"weak_implied\"}, {\"i\": 28, \"why\": \"explicit\"}, {\"i\": 30, \"why\": \"explicit\"}, {\"i\": 33, \"why\": \"explicit\"}, {\"i\": 34, \"why\": \"weak_implied\"}, {\"i\": 36, \"why\": \"weak_implied\"}, {\"i\": 39, \"why\": \"explicit\"}, {\"i\": 41, \"why\": \"style_or_meta\"}, {\"i\": 43, \"why\": \"style_or_meta\"}, {\"i\": 45, \"why\": \"style_or_meta\"}, {\"i\": 47, \"why\": \"explicit\"}, {\"i\": 49, \"why\": \"weak_implied\"}, {\"i\": 51, \"why\": \"style_or_meta\"}, {\"i\": 53, \"why\": \"weak_implied\"}, {\"i\": 55, \"why\": \"weak_implied\"}, {\"i\": 57, \"why\": \"weak_implied\"}, {\"i\": 59, \"why\": \"weak_implied\"}, {\"i\": 61, \"why\": \"weak_implied\"}, {\"i\": 63, \"why\": \"weak_implied\"}, {\"i\": 65, \"why\": \"weak_implied\"}, {\"i\": 67, \"why\": \"weak_implied\"}, {\"i\": 69, \"why\": \"style_or_meta\"}, {\"i\": 71, \"why\": \"style_or_meta\"}, {\"i\": 73, \"why\": \"style_or_meta\"}, {\"i\": 75}]}. Got: 1 validation error for Stage3SelectionResponse\nselections.35.why\n Field required [type=missing, input_value={'i': 75}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE "]}
9
+ {"id": 17482, "n_gt": 22, "n_retrieved": 106, "n_selected": 40, "n_implied": 17, "n_structural": 3, "n_probe": 3, "ret_R": 0.3636, "P": 0.375, "R": 0.6818, "F1": 0.4839, "leaf_P": 0.25, "leaf_R": 0.3077, "leaf_F1": 0.2759, "n_leaf_sel": 16, "n_leaf_gt": 13, "ret_P": 0.0755, "sel_given_ret": 1.875, "over_sel": 1.82, "why": {"explicit": 18, "strong_implied": 1}, "stage3_diag": {"mode": "single_shot", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 3, "attempt_errors": 2, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 28, "attempts_by_n_local": {"109": {"attempts": 3, "parse_ok": 1, "parse_fail": 0, "errors": 2}}, "attempt_failure_rate": 0.6666666666666666, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.375, "gen_R": 0.6818, "gen_F1": 0.4839, "missed": ["fingers", "fur", "holding_musical_instrument", "holding_object", "music", "spade_tail", "tail"], "extra": ["action_pose", "blonde_hair", "bottomwear", "canis", "crosslegged_pose", "denim", "denim_clothing", "electric_guitar", "finger_claws", "flowing_hair", "jeans", "pants", "pastel_background", "playing_guitar", "playing_music", "pose", "shirt", "topwear", "torn_body", "torn_bottomwear", "torn_jeans", "torn_pants", "torn_shirt", "torn_topwear", "wolf"], "ground_truth_tags": ["anthro", "bass_guitar", "canid", "canine", "claws", "clothed", "clothing", "fingers", "fur", "guitar", "hair", "holding_musical_instrument", "holding_object", "mammal", "music", "musical_instrument", "plucked_string_instrument", "solo", "spade_tail", "string_instrument", "tail", "torn_clothing"], "selected_tags": ["action_pose", "anthro", "bass_guitar", "blonde_hair", "bottomwear", "canid", "canine", "canis", "claws", "clothed", "clothing", "crosslegged_pose", "denim", 
"denim_clothing", "electric_guitar", "finger_claws", "flowing_hair", "guitar", "hair", "jeans", "mammal", "musical_instrument", "pants", "pastel_background", "playing_guitar", "playing_music", "plucked_string_instrument", "pose", "shirt", "solo", "string_instrument", "topwear", "torn_body", "torn_bottomwear", "torn_clothing", "torn_jeans", "torn_pants", "torn_shirt", "torn_topwear", "wolf"], "stage3_selected": ["action_pose", "bass_guitar", "blonde_hair", "claws", "crosslegged_pose", "electric_guitar", "finger_claws", "flowing_hair", "guitar", "pastel_background", "playing_guitar", "playing_music", "torn_body", "torn_bottomwear", "torn_clothing", "torn_jeans", "torn_pants", "torn_shirt", "wolf"], "stage3_selected_scores": {"claws": 0.5504, "wolf": 0.5691, "blonde_hair": 0.3645, "torn_clothing": 0.3951, "finger_claws": 0.422, "torn_bottomwear": 0.4254, "guitar": 0.9788, "torn_pants": 0.4559, "playing_music": 0.8891, "torn_shirt": 0.3906, "playing_guitar": 0.9494, "torn_jeans": 0.4784, "electric_guitar": 0.8829, "bass_guitar": 0.9286, "flowing_hair": 0.5466, "crosslegged_pose": 0.445, "torn_body": 0.388, "pastel_background": 0.5453, "action_pose": 0.5685}, "stage3_selected_ranks": {"claws": 17, "wolf": 12, "blonde_hair": 101, "torn_clothing": 83, "finger_claws": 65, "torn_bottomwear": 64, "guitar": 1, "torn_pants": 43, "playing_music": 4, "torn_shirt": 85, "playing_guitar": 2, "torn_jeans": 30, "electric_guitar": 5, "bass_guitar": 3, "flowing_hair": 19, "crosslegged_pose": 49, "torn_body": 87, "pastel_background": 20, "action_pose": 13}, "stage3_selected_phrase_ranks": {"claws": 1, "wolf": 1, "blonde_hair": 6, "torn_clothing": 6, "finger_claws": 5, "torn_bottomwear": 3, "guitar": 1, "torn_pants": 2, "playing_music": 3, "torn_shirt": 7, "playing_guitar": 1, "torn_jeans": 1, "electric_guitar": 4, "bass_guitar": 2, "flowing_hair": 1, "crosslegged_pose": 6, "torn_body": 8, "pastel_background": 1, "action_pose": 1}, "extra_evidence": {"action_pose": {"source": "stage3", 
"why": "strong_implied", "retrieval_score": 0.5685}, "blonde_hair": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3645}, "bottomwear": {"source": "implied"}, "canis": {"source": "implied"}, "crosslegged_pose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.445}, "denim": {"source": "implied"}, "denim_clothing": {"source": "implied"}, "electric_guitar": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8829}, "finger_claws": {"source": "stage3", "why": "explicit", "retrieval_score": 0.422}, "flowing_hair": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5466}, "jeans": {"source": "implied"}, "pants": {"source": "implied"}, "pastel_background": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5453}, "playing_guitar": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9494}, "playing_music": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8891}, "pose": {"source": "implied"}, "shirt": {"source": "implied"}, "topwear": {"source": "implied"}, "torn_body": {"source": "stage3", "why": "explicit", "retrieval_score": 0.388}, "torn_bottomwear": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4254}, "torn_jeans": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4784}, "torn_pants": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4559}, "torn_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3906}, "torn_topwear": {"source": "implied"}, "wolf": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5691}}, "structural": ["solo", "anthro", "clothed"], "probe": ["anthro", "canid", "solo"], "t1": 2.21, "t2": 1.05, "t3": 33.71, "t3s": 1.77, "t3p": 4.53, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=109 entity=2 copyright_filtered=0 generic_char_to_general=0 unknown_type=2", "Stage3 general_single_shot: attempt 1 error: Failed to parse 
Stage3SelectionResponse from completion {\"selections\": [{\"i\": 3, \"why\": \"explicit\"}, {\"i\": 6, \"why\": \"explicit\"}, {\"i\": 10, \"why\": \"explicit\"}, {\"i\": 11, \"why\": \"explicit\"}, {\"i\": 13, \"why\": \"explicit\"}, {\"i\": 14, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"weak_implied\"}, {\"i\": 21, \"why\": \"explicit\"}, {\"i\": 24, \"why\": \"weak_implied\"}, {\"i\": 26, \"why\": \"explicit\"}, {\"i\": 31, \"why\": \"weak_implied\"}, {\"i\": 34, \"why\": \"explicit\"}, {\"i\": 35, \"why\": \"explicit\"}, {\"i\": 36, \"why\": \"weak_implied\"}, {\"i\": 38, \"why\": \"explicit\"}, {\"i\": 40, \"why\": \"other\"}, {\"i\": 43, \"why\": \"weak_implied\"}, {\"i\": 46, \"why\": \"explicit\"}, {\"i\": 48, \"why\": \"weak_implied\"}, {\"i\": 51, \"why\": \"other\"}, {\"i\": 54, \"why\": \"weak_implied\"}, {\"i\": 58, \"why\": \"explicit\"}, {\"i\": 60, \"why\": \"explicit\"}, {\"i\": 64, \"why\": \"weak_implied\"}, {\"i\": 67, \"why\": \"weak_implied\"}, {\"i\": 69, \"why\": \"style_or_meta\"}, {\"i\": 71, \"why\": \"explicit\"}, {\"i\": 73, \"why\": \"other\"}, {\"i\": 76, \"why\": \"weak_implied\"}, {\"i\": 80, \"why\": \"weak_implied\"}, {\"i\": 82, \"why\": \"explicit\"}, {\"i\": 85, \"why\": \"weak_implied\"}, {\"i\": 87, \"why\": \"other\"}, {\"i\": 90, \"why\": \"style_or_meta\"}, {\"i\": 92, \"why\": \"explicit\"}, {\"i\": 94, \"why\": \"weak_implied\"}, {\"i\": 96}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.36.why\n Field required [type=missing, input_value={'i': 96}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_single_shot: attempt 2 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 3, \"why\": \"explicit\"}, {\"i\": 6, \"why\": \"explicit\"}, {\"i\": 7, \"why\": \"explicit\"}, {\"i\": 10, \"why\": \"explicit\"}, {\"i\": 11, \"why\": \"explicit\"}, {\"i\": 13, \"why\": \"explicit\"}, {\"i\": 14, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"weak_implied\"}, {\"i\": 21, \"why\": \"explicit\"}, {\"i\": 24, \"why\": \"weak_implied\"}, {\"i\": 26, \"why\": \"explicit\"}, {\"i\": 27, \"why\": \"explicit\"}, {\"i\": 31, \"why\": \"weak_implied\"}, {\"i\": 34, \"why\": \"explicit\"}, {\"i\": 36, \"why\": \"weak_implied\"}, {\"i\": 38, \"why\": \"explicit\"}, {\"i\": 40, \"why\": \"other\"}, {\"i\": 43, \"why\": \"weak_implied\"}, {\"i\": 46, \"why\": \"explicit\"}, {\"i\": 48, \"why\": \"weak_implied\"}, {\"i\": 51, \"why\": \"other\"}, {\"i\": 54, \"why\": \"weak_implied\"}, {\"i\": 58, \"why\": \"explicit\"}, {\"i\": 60, \"why\": \"explicit\"}, {\"i\": 62, \"why\": \"weak_implied\"}, {\"i\": 65, \"why\": \"weak_implied\"}, {\"i\": 68, \"why\": \"explicit\"}, {\"i\": 70, \"why\": \"weak_implied\"}, {\"i\": 72, \"why\": \"explicit\"}, {\"i\": 74, \"why\": \"weak_implied\"}, {\"i\": 76, \"why\": \"weak_implied\"}, {\"i\": 80, \"why\": \"weak_implied\"}, {\"i\": 82, \"why\": \"explicit\"}, {\"i\": 84, \"why\": \"weak_implied\"}, {\"i\": 86, \"why\": \"weak_implied\"}, {\"i\": 88, \"why\": \"other\"}, {\"i\": 90}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.36.why\n Field required [type=missing, input_value={'i': 90}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE "]}
10
+ {"id": 2021552, "n_gt": 25, "n_retrieved": 161, "n_selected": 5, "n_implied": 1, "n_structural": 4, "n_probe": 3, "ret_R": 0.64, "P": 0.8, "R": 0.16, "F1": 0.2667, "leaf_P": 0.75, "leaf_R": 0.2, "leaf_F1": 0.3158, "n_leaf_sel": 4, "n_leaf_gt": 15, "ret_P": 0.0994, "sel_given_ret": 0.25, "over_sel": 0.2, "why": {}, "stage3_diag": {"mode": "single_shot", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 0, "calls_exhausted_retries": 1, "attempts_total": 3, "attempt_errors": 2, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 0, "attempts_by_n_local": {"161": {"attempts": 3, "parse_ok": 1, "parse_fail": 0, "errors": 2}}, "attempt_failure_rate": 0.6666666666666666, "call_exhaustion_rate": 1.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.8, "gen_R": 0.16, "gen_F1": 0.2667, "missed": ["bottomwear", "canid", "canine", "claws", "crossed_arms", "facial_markings", "fox", "fur", "grey_background", "head_markings", "lagomorph", "leporid", "looking_at_another", "mammal", "markings", "overalls", "pants", "rabbit", "shirt", "standing", "topwear"], "extra": ["looking_at_viewer"], "ground_truth_tags": ["anthro", "bottomwear", "canid", "canine", "claws", "clothed", "clothing", "crossed_arms", "duo", "facial_markings", "fox", "fur", "grey_background", "head_markings", "lagomorph", "leporid", "looking_at_another", "mammal", "markings", "overalls", "pants", "rabbit", "shirt", "standing", "topwear"], "selected_tags": ["anthro", "clothed", "clothing", "duo", "looking_at_viewer"], "stage3_selected": [], "stage3_selected_scores": {}, "stage3_selected_ranks": {}, "stage3_selected_phrase_ranks": {}, "extra_evidence": {"looking_at_viewer": {"source": "structural"}}, "structural": ["duo", "anthro", "clothed", "looking_at_viewer"], "probe": ["simple_background", "anthro", "duo"], "t1": 2.22, "t2": 1.6, "t3": 
26.78, "t3s": 0.46, "t3p": 8.01, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=161 entity=5 copyright_filtered=0 generic_char_to_general=0 unknown_type=3", "Stage3 general_single_shot: attempt 2 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"weak_implied\"}, {\"i\": 3, \"why\": \"explicit\"}, {\"i\": 4, \"why\": \"explicit\"}, {\"i\": 5, \"why\": \"weak_implied\"}, {\"i\": 7, \"why\": \"weak_implied\"}, {\"i\": 8, \"why\": \"explicit\"}, {\"i\": 9, \"why\": \"weak_implied\"}, {\"i\": 10, \"why\": \"explicit\"}, {\"i\": 11, \"why\": \"explicit\"}, {\"i\": 12, \"why\": \"weak_implied\"}, {\"i\": 13, \"why\": \"weak_implied\"}, {\"i\": 14, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"explicit\"}, {\"i\": 17, \"why\": \"other\"}, {\"i\": 18, \"why\": \"explicit\"}, {\"i\": 19, \"why\": \"explicit\"}, {\"i\": 20, \"why\": \"explicit\"}, {\"i\": 21, \"why\": \"other\"}, {\"i\": 23, \"why\": \"other\"}, {\"i\": 25, \"why\": \"weak_implied\"}, {\"i\": 27, \"why\": \"weak_implied\"}, {\"i\": 29, \"why\": \"weak_implied\"}, {\"i\": 31, \"why\": \"weak_implied\"}, {\"i\": 33, \"why\": \"other\"}, {\"i\": 35, \"why\": \"other\"}, {\"i\": 37, \"why\": \"other\"}, {\"i\": 39, \"why\": \"other\"}, {\"i\": 41, \"why\": \"other\"}, {\"i\": 43, \"why\": \"explicit\"}, {\"i\": 45, \"why\": \"weak_implied\"}, {\"i\": 47, \"why\": \"weak_implied\"}, {\"i\": 49, \"why\": \"weak_implied\"}, {\"i\": 51, \"why\": \"other\"}, {\"i\": 53, \"why\": \"other\"}, {\"i\": 55, \"why\": \"other\"}, {\"i\": 57, \"why\": \"other\"}, {\"i\": 59}]}. 
Got: 1 validation error for Stage3SelectionResponse\nselections.36.why\n Field required [type=missing, input_value={'i': 59}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_single_shot: attempt 3 error: Failed to parse Stage3SelectionResponse from completion {\"selections\": [{\"i\": 1, \"why\": \"explicit\"}, {\"i\": 3, \"why\": \"explicit\"}, {\"i\": 5, \"why\": \"explicit\"}, {\"i\": 7, \"why\": \"explicit\"}, {\"i\": 9, \"why\": \"explicit\"}, {\"i\": 10, \"why\": \"explicit\"}, {\"i\": 12, \"why\": \"weak_implied\"}, {\"i\": 14, \"why\": \"explicit\"}, {\"i\": 16, \"why\": \"explicit\"}, {\"i\": 18, \"why\": \"explicit\"}, {\"i\": 20, \"why\": \"explicit\"}, {\"i\": 22, \"why\": \"other\"}, {\"i\": 24, \"why\": \"explicit\"}, {\"i\": 26, \"why\": \"other\"}, {\"i\": 28, \"why\": \"other\"}, {\"i\": 30, \"why\": \"other\"}, {\"i\": 32, \"why\": \"weak_implied\"}, {\"i\": 34, \"why\": \"weak_implied\"}, {\"i\": 36, \"why\": \"explicit\"}, {\"i\": 38, \"why\": \"explicit\"}, {\"i\": 40, \"why\": \"explicit\"}, {\"i\": 42, \"why\": \"weak_implied\"}, {\"i\": 44, \"why\": \"explicit\"}, {\"i\": 46, \"why\": \"weak_implied\"}, {\"i\": 48, \"why\": \"other\"}, {\"i\": 50, \"why\": \"other\"}, {\"i\": 52, \"why\": \"weak_implied\"}, {\"i\": 54, \"why\": \"weak_implied\"}, {\"i\": 56, \"why\": \"explicit\"}, {\"i\": 58, \"why\": \"other\"}, {\"i\": 60, \"why\": \"weak_implied\"}, {\"i\": 62, \"why\": \"other\"}, {\"i\": 64, \"why\": \"weak_implied\"}, {\"i\": 66, \"why\": \"other\"}, {\"i\": 68, \"why\": \"other\"}, {\"i\": 70, \"why\": \"other\"}, {\"i\": 72, \"why\": \"weak_implied\"}, {}]}. 
Got: 2 validation errors for Stage3SelectionResponse\nselections.37.i\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nselections.37.why\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.12/v/missing\nFor troubleshooting, visit: https://docs.langchain.com/oss/python/langchain/errors/OUTPUT_PARSING_FAILURE ", "Stage3 general_single_shot: gave up after 3 attempts"]}
11
+ {"id": 2034167, "n_gt": 11, "n_retrieved": 204, "n_selected": 87, "n_implied": 31, "n_structural": 4, "n_probe": 3, "ret_R": 0.6364, "P": 0.1034, "R": 0.8182, "F1": 0.1837, "leaf_P": 0.0577, "leaf_R": 0.4286, "leaf_F1": 0.1017, "n_leaf_sel": 52, "n_leaf_gt": 7, "ret_P": 0.0343, "sel_given_ret": 1.2857, "over_sel": 7.91, "why": {"explicit": 52}, "stage3_diag": {"mode": "single_shot", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 54, "attempts_by_n_local": {"205": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.1034, "gen_R": 0.8182, "gen_F1": 0.1837, "missed": ["blue_eyes", "blue_nose"], "extra": ["2_tails", "abs", "action_pose", "amber_eyes", "animal_humanoid", "anthro", "blue_background", "blue_fingers", "blue_neck", "blue_pawpads", "blue_paws", "blue_toes", "body_hair", "bovid", "bovid_humanoid", "canid_humanoid", "canine_humanoid", "caprine", "caprine_humanoid", "clothed", "clothing", "female_humanoid", "fighting_pose", "goat_humanoid", "grey_nose", "half-closed_eyes", "half_body", "heterochromia", "holding_tail", "humanoid", "inner_ear_fluff", "jagged_mouth", "male", "male_humanoid", "mammal_humanoid", "melee_weapon", "mouth_full", "multi_tail", "muscular", "muscular_male", "narrow_tail", "narrowed_eyes", "nervous_expression", "no_irises", "open_smile", "paw_pose", "pawpads", "pig_humanoid", "pink_ears", "pink_legs", "pink_mouth", "pink_stripes", "pink_tongue", "polearm", "pose", "pupils", "purple_background", "purple_face", "rider", "slim_humanoid", "slit_pupils", "smile", "striped_neck", "stripes", "suid", "suid_humanoid", "suina", 
"suina_humanoid", "tail", "teeth", "tongue", "trident", "tuft", "two_tone_tail", "wave", "weapon", "white_inner_ear_fluff", "white_tail"], "ground_truth_tags": ["blue_eyes", "blue_nose", "canid", "canine", "fur", "mammal", "open_mouth", "purple_body", "solo", "white_body", "white_fur"], "selected_tags": ["2_tails", "abs", "action_pose", "amber_eyes", "animal_humanoid", "anthro", "blue_background", "blue_fingers", "blue_neck", "blue_pawpads", "blue_paws", "blue_toes", "body_hair", "bovid", "bovid_humanoid", "canid", "canid_humanoid", "canine", "canine_humanoid", "caprine", "caprine_humanoid", "clothed", "clothing", "female_humanoid", "fighting_pose", "fur", "goat_humanoid", "grey_nose", "half-closed_eyes", "half_body", "heterochromia", "holding_tail", "humanoid", "inner_ear_fluff", "jagged_mouth", "male", "male_humanoid", "mammal", "mammal_humanoid", "melee_weapon", "mouth_full", "multi_tail", "muscular", "muscular_male", "narrow_tail", "narrowed_eyes", "nervous_expression", "no_irises", "open_mouth", "open_smile", "paw_pose", "pawpads", "pig_humanoid", "pink_ears", "pink_legs", "pink_mouth", "pink_stripes", "pink_tongue", "polearm", "pose", "pupils", "purple_background", "purple_body", "purple_face", "rider", "slim_humanoid", "slit_pupils", "smile", "solo", "striped_neck", "stripes", "suid", "suid_humanoid", "suina", "suina_humanoid", "tail", "teeth", "tongue", "trident", "tuft", "two_tone_tail", "wave", "weapon", "white_body", "white_fur", "white_inner_ear_fluff", "white_tail"], "stage3_selected": ["2_tails", "abs", "action_pose", "amber_eyes", "animal_humanoid", "blue_background", "blue_fingers", "blue_neck", "blue_pawpads", "blue_paws", "blue_toes", "body_hair", "canine_humanoid", "female_humanoid", "fighting_pose", "goat_humanoid", "grey_nose", "half-closed_eyes", "half_body", "heterochromia", "holding_tail", "jagged_mouth", "male_humanoid", "mouth_full", "muscular_male", "narrow_tail", "nervous_expression", "no_irises", "open_smile", "paw_pose", 
"pig_humanoid", "pink_ears", "pink_legs", "pink_mouth", "pink_stripes", "pink_tongue", "purple_background", "purple_body", "purple_face", "rider", "simple_background", "slim_humanoid", "slit_pupils", "striped_neck", "teeth", "trident", "two_tone_tail", "wave", "white_body", "white_fur", "white_inner_ear_fluff", "white_tail"], "stage3_selected_scores": {"simple_background": 0.5948, "teeth": 0.3603, "white_body": 0.4875, "white_fur": 0.5995, "muscular_male": 0.3102, "abs": 0.3223, "half-closed_eyes": 0.3629, "animal_humanoid": 0.6159, "purple_body": 0.564, "open_smile": 0.4868, "body_hair": 0.305, "slit_pupils": 0.396, "pink_tongue": 0.4215, "blue_background": 0.48, "canine_humanoid": 0.9003, "white_tail": 0.5202, "heterochromia": 0.4423, "two_tone_tail": 0.5197, "amber_eyes": 0.4076, "purple_background": 0.5414, "blue_pawpads": 0.4891, "2_tails": 0.4672, "white_inner_ear_fluff": 0.597, "action_pose": 0.617, "grey_nose": 0.4662, "pink_ears": 0.5255, "holding_tail": 0.5079, "fighting_pose": 0.4593, "wave": 0.3632, "narrow_tail": 0.5074, "paw_pose": 0.5582, "trident": 0.2683, "goat_humanoid": 0.5534, "blue_paws": 0.4986, "no_irises": 0.4008, "pink_stripes": 0.682, "female_humanoid": 0.563, "purple_face": 0.5577, "jagged_mouth": 0.5168, "male_humanoid": 0.5627, "pink_mouth": 0.5127, "striped_neck": 0.5948, "blue_fingers": 0.5077, "blue_toes": 0.5148, "mouth_full": 0.458, "slim_humanoid": 0.588, "rider": 0.2712, "half_body": 0.4115, "pig_humanoid": 0.5894, "blue_neck": 0.5222, "nervous_expression": 0.4772, "pink_legs": 0.5285}, "stage3_selected_ranks": {"simple_background": 36, "teeth": 190, "white_body": 114, "white_fur": 30, "muscular_male": 202, "abs": 199, "half-closed_eyes": 187, "animal_humanoid": 20, "purple_body": 49, "open_smile": 116, "body_hair": 203, "slit_pupils": 178, "pink_tongue": 166, "blue_background": 122, "canine_humanoid": 1, "white_tail": 82, "heterochromia": 158, "two_tone_tail": 83, "amber_eyes": 174, "purple_background": 64, "blue_pawpads": 109, 
"2_tails": 136, "white_inner_ear_fluff": 32, "action_pose": 19, "grey_nose": 137, "pink_ears": 75, "holding_tail": 92, "fighting_pose": 140, "wave": 186, "narrow_tail": 94, "paw_pose": 56, "trident": 209, "goat_humanoid": 58, "blue_paws": 102, "no_irises": 176, "pink_stripes": 13, "female_humanoid": 50, "purple_face": 57, "jagged_mouth": 86, "male_humanoid": 52, "pink_mouth": 90, "striped_neck": 35, "blue_fingers": 93, "blue_toes": 87, "mouth_full": 143, "slim_humanoid": 38, "rider": 207, "half_body": 172, "pig_humanoid": 37, "blue_neck": 80, "nervous_expression": 127, "pink_legs": 73}, "stage3_selected_phrase_ranks": {"simple_background": 1, "teeth": 7, "white_body": 8, "white_fur": 1, "muscular_male": 8, "abs": 7, "half-closed_eyes": 6, "animal_humanoid": 2, "purple_body": 2, "open_smile": 2, "body_hair": 9, "slit_pupils": 10, "pink_tongue": 3, "blue_background": 7, "canine_humanoid": 1, "white_tail": 7, "heterochromia": 6, "two_tone_tail": 8, "amber_eyes": 9, "purple_background": 5, "blue_pawpads": 10, "2_tails": 10, "white_inner_ear_fluff": 2, "action_pose": 1, "grey_nose": 7, "pink_ears": 4, "holding_tail": 8, "fighting_pose": 5, "wave": 6, "narrow_tail": 9, "paw_pose": 2, "trident": 8, "goat_humanoid": 8, "blue_paws": 9, "no_irises": 8, "pink_stripes": 1, "female_humanoid": 5, "purple_face": 3, "jagged_mouth": 5, "male_humanoid": 7, "pink_mouth": 6, "striped_neck": 4, "blue_fingers": 5, "blue_toes": 4, "mouth_full": 9, "slim_humanoid": 4, "rider": 6, "half_body": 4, "pig_humanoid": 3, "blue_neck": 3, "nervous_expression": 5, "pink_legs": 3}, "extra_evidence": {"2_tails": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4672}, "abs": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3223}, "action_pose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.617}, "amber_eyes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4076}, "animal_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6159}, 
"anthro": {"source": "structural"}, "blue_background": {"source": "stage3", "why": "explicit", "retrieval_score": 0.48}, "blue_fingers": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5077}, "blue_neck": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5222}, "blue_pawpads": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4891}, "blue_paws": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4986}, "blue_toes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5148}, "body_hair": {"source": "stage3", "why": "explicit", "retrieval_score": 0.305}, "bovid": {"source": "implied"}, "bovid_humanoid": {"source": "implied"}, "canid_humanoid": {"source": "implied"}, "canine_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9003}, "caprine": {"source": "implied"}, "caprine_humanoid": {"source": "implied"}, "clothed": {"source": "structural"}, "clothing": {"source": "implied"}, "female_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.563}, "fighting_pose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4593}, "goat_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5534}, "grey_nose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4662}, "half-closed_eyes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3629}, "half_body": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4115}, "heterochromia": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4423}, "holding_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5079}, "humanoid": {"source": "implied"}, "inner_ear_fluff": {"source": "implied"}, "jagged_mouth": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5168}, "male": {"source": "structural"}, "male_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5627}, "mammal_humanoid": {"source": "implied"}, "melee_weapon": {"source": "implied"}, 
"mouth_full": {"source": "stage3", "why": "explicit", "retrieval_score": 0.458}, "multi_tail": {"source": "implied"}, "muscular": {"source": "implied"}, "muscular_male": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3102}, "narrow_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5074}, "narrowed_eyes": {"source": "implied"}, "nervous_expression": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4772}, "no_irises": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4008}, "open_smile": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4868}, "paw_pose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5582}, "pawpads": {"source": "implied"}, "pig_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5894}, "pink_ears": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5255}, "pink_legs": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5285}, "pink_mouth": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5127}, "pink_stripes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.682}, "pink_tongue": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4215}, "polearm": {"source": "implied"}, "pose": {"source": "implied"}, "pupils": {"source": "implied"}, "purple_background": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5414}, "purple_face": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5577}, "rider": {"source": "stage3", "why": "explicit", "retrieval_score": 0.2712}, "slim_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.588}, "slit_pupils": {"source": "stage3", "why": "explicit", "retrieval_score": 0.396}, "smile": {"source": "implied"}, "striped_neck": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5948}, "stripes": {"source": "implied"}, "suid": {"source": "implied"}, "suid_humanoid": {"source": "implied"}, "suina": {"source": "implied"}, 
"suina_humanoid": {"source": "implied"}, "tail": {"source": "implied"}, "teeth": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3603}, "tongue": {"source": "implied"}, "trident": {"source": "stage3", "why": "explicit", "retrieval_score": 0.2683}, "tuft": {"source": "implied"}, "two_tone_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5197}, "wave": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3632}, "weapon": {"source": "implied"}, "white_inner_ear_fluff": {"source": "stage3", "why": "explicit", "retrieval_score": 0.597}, "white_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5202}}, "structural": ["solo", "anthro", "male", "clothed"], "probe": ["anthro", "canid", "solo"], "t1": 1.23, "t2": 2.07, "t3": 0.74, "t3s": 2.63, "t3p": 2.06, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=205 entity=4 copyright_filtered=2 generic_char_to_general=0 unknown_type=5"]}
data/eval_results/smoke_no_why_explicit_only_n1.jsonl ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ {"_meta": true, "timestamp": "2026-03-03T04:05:15.045798", "n_samples": 1, "caption_field": "caption_cogvlm", "skip_rewrite": false, "allow_nsfw": false, "mode": "chunked_map_union", "chunk_size": 60, "eval_path": "data\\eval_samples\\e621_sfw_sample_1000_seed123_buffer10000_caption_evident.jsonl", "per_phrase_k": 2, "per_phrase_final_k": 1, "temperature": 0.0, "shuffle": true, "seed": 42, "workers": 1, "min_why": "strong_implied", "expand_implications": true, "infer_structural": true, "infer_probe": true, "n_errors": 0, "n_issue_samples": 1, "n_issues_total": 2}
2
+ {"id": 17482, "n_gt": 22, "n_retrieved": 16, "n_selected": 32, "n_implied": 16, "n_structural": 3, "n_probe": 4, "ret_R": 0.2727, "P": 0.5, "R": 0.7273, "F1": 0.5926, "leaf_P": 0.4167, "leaf_R": 0.3846, "leaf_F1": 0.4, "n_leaf_sel": 12, "n_leaf_gt": 13, "ret_P": 0.375, "sel_given_ret": 2.6667, "over_sel": 1.45, "why": {"explicit": 11}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 11, "attempts_by_n_local": {"18": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.5, "gen_R": 0.7273, "gen_F1": 0.5926, "missed": ["bass_guitar", "fingers", "fur", "holding_musical_instrument", "holding_object", "music"], "extra": ["bottomwear", "canis", "denim", "denim_clothing", "flowing_hair", "jeans", "pants", "pastel_background", "playing_guitar", "playing_music", "pose", "string", "torn_bottomwear", "torn_jeans", "torn_pants", "wolf"], "ground_truth_tags": ["anthro", "bass_guitar", "canid", "canine", "claws", "clothed", "clothing", "fingers", "fur", "guitar", "hair", "holding_musical_instrument", "holding_object", "mammal", "music", "musical_instrument", "plucked_string_instrument", "solo", "spade_tail", "string_instrument", "tail", "torn_clothing"], "selected_tags": ["anthro", "bottomwear", "canid", "canine", "canis", "claws", "clothed", "clothing", "denim", "denim_clothing", "flowing_hair", "guitar", "hair", "jeans", "mammal", "musical_instrument", "pants", "pastel_background", "playing_guitar", "playing_music", "plucked_string_instrument", "pose", "solo", "spade_tail", "string", "string_instrument", "tail", 
"torn_bottomwear", "torn_clothing", "torn_jeans", "torn_pants", "wolf"], "stage3_selected": ["claws", "flowing_hair", "guitar", "pastel_background", "playing_guitar", "pose", "spade_tail", "string", "tail", "torn_jeans", "wolf"], "stage3_selected_scores": {"tail": 0.5423, "claws": 0.5488, "wolf": 0.564, "pose": 0.5518, "spade_tail": 0.5579, "guitar": 0.9729, "playing_guitar": 0.9849, "torn_jeans": 0.4765, "string": 0.5804, "flowing_hair": 0.5336, "pastel_background": 0.542}, "stage3_selected_ranks": {"tail": 10, "claws": 9, "wolf": 4, "pose": 7, "spade_tail": 6, "guitar": 2, "playing_guitar": 1, "torn_jeans": 16, "string": 3, "flowing_hair": 13, "pastel_background": 11}, "stage3_selected_phrase_ranks": {"tail": 1, "claws": 1, "wolf": 1, "pose": 1, "spade_tail": 1, "guitar": 1, "playing_guitar": 1, "torn_jeans": 1, "string": 1, "flowing_hair": 1, "pastel_background": 1}, "extra_evidence": {"bottomwear": {"source": "implied"}, "canis": {"source": "implied"}, "denim": {"source": "implied"}, "denim_clothing": {"source": "implied"}, "flowing_hair": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5336}, "jeans": {"source": "implied"}, "pants": {"source": "implied"}, "pastel_background": {"source": "stage3", "why": "explicit", "retrieval_score": 0.542}, "playing_guitar": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9849}, "playing_music": {"source": "implied"}, "pose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5518}, "string": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5804}, "torn_bottomwear": {"source": "implied"}, "torn_jeans": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4765}, "torn_pants": {"source": "implied"}, "wolf": {"source": "stage3", "why": "explicit", "retrieval_score": 0.564}}, "structural": ["solo", "anthro", "clothed"], "probe": ["solo", "clothing", "canid", "anthro"], "t1": 2.43, "t2": 3.78, "t3": 1.97, "t3s": 7.61, "t3p": 3.89, "err": null, "issues": ["Stage3 split: 
general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=18 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
data/eval_results/smoke_no_why_explicit_only_n1_v2.jsonl ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ {"_meta": true, "timestamp": "2026-03-03T04:05:53.753317", "n_samples": 1, "caption_field": "caption_cogvlm", "skip_rewrite": false, "allow_nsfw": false, "mode": "chunked_map_union", "chunk_size": 60, "eval_path": "data\\eval_samples\\e621_sfw_sample_1000_seed123_buffer10000_caption_evident.jsonl", "per_phrase_k": 2, "per_phrase_final_k": 1, "temperature": 0.0, "shuffle": true, "seed": 42, "workers": 1, "min_why": "strong_implied", "expand_implications": true, "infer_structural": true, "infer_probe": true, "n_errors": 0, "n_issue_samples": 1, "n_issues_total": 2}
2
+ {"id": 17482, "n_gt": 22, "n_retrieved": 20, "n_selected": 27, "n_implied": 16, "n_structural": 3, "n_probe": 3, "ret_R": 0.2727, "P": 0.5556, "R": 0.6818, "F1": 0.6122, "leaf_P": 0.5, "leaf_R": 0.3846, "leaf_F1": 0.4348, "n_leaf_sel": 10, "n_leaf_gt": 13, "ret_P": 0.3, "sel_given_ret": 2.5, "over_sel": 1.23, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 7, "attempts_by_n_local": {"22": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.5556, "gen_R": 0.6818, "gen_F1": 0.6122, "missed": ["bass_guitar", "canine", "fingers", "fur", "holding_musical_instrument", "holding_object", "music"], "extra": ["bottomwear", "denim", "denim_clothing", "flowing_hair", "jeans", "pants", "pastel_background", "playing_guitar", "playing_music", "torn_bottomwear", "torn_jeans", "torn_pants"], "ground_truth_tags": ["anthro", "bass_guitar", "canid", "canine", "claws", "clothed", "clothing", "fingers", "fur", "guitar", "hair", "holding_musical_instrument", "holding_object", "mammal", "music", "musical_instrument", "plucked_string_instrument", "solo", "spade_tail", "string_instrument", "tail", "torn_clothing"], "selected_tags": ["anthro", "bottomwear", "canid", "claws", "clothed", "clothing", "denim", "denim_clothing", "flowing_hair", "guitar", "hair", "jeans", "mammal", "musical_instrument", "pants", "pastel_background", "playing_guitar", "playing_music", "plucked_string_instrument", "solo", "spade_tail", "string_instrument", "tail", "torn_bottomwear", "torn_clothing", "torn_jeans", "torn_pants"], 
"stage3_selected": ["claws", "flowing_hair", "guitar", "pastel_background", "playing_guitar", "spade_tail", "torn_jeans"], "stage3_selected_scores": {"claws": 0.5465, "spade_tail": 0.5572, "guitar": 0.9726, "playing_guitar": 0.9847, "torn_jeans": 0.4758, "flowing_hair": 0.5328, "pastel_background": 0.5404}, "stage3_selected_ranks": {"claws": 9, "spade_tail": 5, "guitar": 2, "playing_guitar": 1, "torn_jeans": 18, "flowing_hair": 14, "pastel_background": 12}, "stage3_selected_phrase_ranks": {"claws": 1, "spade_tail": 1, "guitar": 1, "playing_guitar": 1, "torn_jeans": 1, "flowing_hair": 1, "pastel_background": 1}, "extra_evidence": {"bottomwear": {"source": "implied"}, "denim": {"source": "implied"}, "denim_clothing": {"source": "implied"}, "flowing_hair": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5328}, "jeans": {"source": "implied"}, "pants": {"source": "implied"}, "pastel_background": {"source": "stage3", "why": "unknown", "retrieval_score": 0.5404}, "playing_guitar": {"source": "stage3", "why": "unknown", "retrieval_score": 0.9847}, "playing_music": {"source": "implied"}, "torn_bottomwear": {"source": "implied"}, "torn_jeans": {"source": "stage3", "why": "unknown", "retrieval_score": 0.4758}, "torn_pants": {"source": "implied"}}, "structural": ["solo", "anthro", "clothed"], "probe": ["solo", "canid", "anthro"], "t1": 2.25, "t2": 3.46, "t3": 3.76, "t3s": 4.18, "t3p": 6.08, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=22 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
data/eval_results/why_gate_compare_explicit_n10.jsonl ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"_meta": true, "timestamp": "2026-03-02T12:34:17.390682", "n_samples": 10, "caption_field": "caption_cogvlm", "skip_rewrite": false, "allow_nsfw": false, "mode": "chunked_map_union", "chunk_size": 60, "eval_path": "data\\eval_samples\\e621_sfw_sample_1000_seed123_buffer10000_caption_evident.jsonl", "per_phrase_k": 2, "per_phrase_final_k": 1, "temperature": 0.0, "shuffle": true, "seed": 42, "workers": 1, "min_why": "explicit", "expand_implications": true, "infer_structural": true, "infer_probe": true, "n_errors": 0, "n_issue_samples": 10, "n_issues_total": 20}
2
+ {"id": 17482, "n_gt": 22, "n_retrieved": 20, "n_selected": 27, "n_implied": 16, "n_structural": 3, "n_probe": 3, "ret_R": 0.2727, "P": 0.5556, "R": 0.6818, "F1": 0.6122, "leaf_P": 0.5, "leaf_R": 0.3846, "leaf_F1": 0.4348, "n_leaf_sel": 10, "n_leaf_gt": 13, "ret_P": 0.3, "sel_given_ret": 2.5, "over_sel": 1.23, "why": {"explicit": 7}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 17, "attempts_by_n_local": {"22": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.5556, "gen_R": 0.6818, "gen_F1": 0.6122, "missed": ["bass_guitar", "canine", "fingers", "fur", "holding_musical_instrument", "holding_object", "music"], "extra": ["bottomwear", "denim", "denim_clothing", "flowing_hair", "jeans", "pants", "pastel_background", "playing_guitar", "playing_music", "torn_bottomwear", "torn_jeans", "torn_pants"], "ground_truth_tags": ["anthro", "bass_guitar", "canid", "canine", "claws", "clothed", "clothing", "fingers", "fur", "guitar", "hair", "holding_musical_instrument", "holding_object", "mammal", "music", "musical_instrument", "plucked_string_instrument", "solo", "spade_tail", "string_instrument", "tail", "torn_clothing"], "selected_tags": ["anthro", "bottomwear", "canid", "claws", "clothed", "clothing", "denim", "denim_clothing", "flowing_hair", "guitar", "hair", "jeans", "mammal", "musical_instrument", "pants", "pastel_background", "playing_guitar", "playing_music", "plucked_string_instrument", "solo", "spade_tail", "string_instrument", "tail", "torn_bottomwear", "torn_clothing", "torn_jeans", "torn_pants"], 
"stage3_selected": ["claws", "flowing_hair", "pastel_background", "playing_guitar", "spade_tail", "tail", "torn_jeans"], "stage3_selected_scores": {"tail": 0.5404, "claws": 0.5465, "spade_tail": 0.5572, "playing_guitar": 0.9847, "torn_jeans": 0.4758, "flowing_hair": 0.5328, "pastel_background": 0.5404}, "stage3_selected_ranks": {"tail": 11, "claws": 9, "spade_tail": 5, "playing_guitar": 1, "torn_jeans": 18, "flowing_hair": 14, "pastel_background": 12}, "stage3_selected_phrase_ranks": {"tail": 1, "claws": 1, "spade_tail": 1, "playing_guitar": 1, "torn_jeans": 1, "flowing_hair": 1, "pastel_background": 1}, "extra_evidence": {"bottomwear": {"source": "implied"}, "denim": {"source": "implied"}, "denim_clothing": {"source": "implied"}, "flowing_hair": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5328}, "jeans": {"source": "implied"}, "pants": {"source": "implied"}, "pastel_background": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5404}, "playing_guitar": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9847}, "playing_music": {"source": "implied"}, "torn_bottomwear": {"source": "implied"}, "torn_jeans": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4758}, "torn_pants": {"source": "implied"}}, "structural": ["solo", "anthro", "clothed"], "probe": ["anthro", "canid", "solo"], "t1": 2.88, "t2": 3.41, "t3": 4.5, "t3s": 3.76, "t3p": 5.46, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=22 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
3
+ {"id": 1624724, "n_gt": 4, "n_retrieved": 11, "n_selected": 13, "n_implied": 0, "n_structural": 4, "n_probe": 3, "ret_R": 0.75, "P": 0.3077, "R": 1.0, "F1": 0.4706, "leaf_P": 0.3077, "leaf_R": 1.0, "leaf_F1": 0.4706, "n_leaf_sel": 13, "n_leaf_gt": 4, "ret_P": 0.2727, "sel_given_ret": 1.3333, "over_sel": 3.25, "why": {"explicit": 9}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 11, "attempts_by_n_local": {"14": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3077, "gen_R": 1.0, "gen_F1": 0.4706, "missed": [], "extra": ["ambiguous_gender", "anthro", "big_eyes", "cartoon", "eyes", "feral", "nose", "nude", "spots"], "ground_truth_tags": ["red_nose", "smile", "solo", "tan_body"], "selected_tags": ["ambiguous_gender", "anthro", "big_eyes", "cartoon", "eyes", "feral", "nose", "nude", "red_nose", "smile", "solo", "spots", "tan_body"], "stage3_selected": ["big_eyes", "cartoon", "eyes", "nose", "red_nose", "smile", "spots", "tan_body", "white_background"], "stage3_selected_scores": {"smile": 0.6013, "white_background": 0.6138, "tan_body": 0.6627, "spots": 0.6272, "big_eyes": 0.696, "red_nose": 0.7501, "cartoon": 0.5003, "nose": 0.8607, "eyes": 0.9241}, "stage3_selected_ranks": {"smile": 10, "white_background": 9, "tan_body": 5, "spots": 8, "big_eyes": 4, "red_nose": 3, "cartoon": 13, "nose": 2, "eyes": 1}, "stage3_selected_phrase_ranks": {"smile": 1, "white_background": 1, "tan_body": 1, "spots": 1, "big_eyes": 1, "red_nose": 1, "cartoon": 1, "nose": 1, "eyes": 1}, "extra_evidence": {"ambiguous_gender": 
{"source": "structural"}, "anthro": {"source": "probe"}, "big_eyes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.696}, "cartoon": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5003}, "eyes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9241}, "feral": {"source": "structural"}, "nose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8607}, "nude": {"source": "structural"}, "spots": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6272}}, "structural": ["solo", "feral", "ambiguous_gender", "nude"], "probe": ["simple_background", "anthro", "solo"], "t1": 1.45, "t2": 1.09, "t3": 2.54, "t3s": 0.76, "t3p": 0.94, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=14 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=5"]}
4
+ {"id": 1078019, "n_gt": 14, "n_retrieved": 15, "n_selected": 15, "n_implied": 3, "n_structural": 4, "n_probe": 5, "ret_R": 0.3571, "P": 0.5333, "R": 0.5714, "F1": 0.5517, "leaf_P": 0.5455, "leaf_R": 0.6667, "leaf_F1": 0.6, "n_leaf_sel": 11, "n_leaf_gt": 9, "ret_P": 0.3333, "sel_given_ret": 1.6, "over_sel": 1.07, "why": {"explicit": 5}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 14, "attempts_by_n_local": {"16": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.5333, "gen_R": 0.5714, "gen_F1": 0.5517, "missed": ["lagomorph", "leporid", "mammal", "rabbit", "romantic", "romantic_couple"], "extra": ["<3", "coat", "holding_object", "holding_plushie", "looking_at_viewer", "relationship", "topwear"], "ground_truth_tags": ["anthro", "blue_eyes", "blush", "clothed", "clothing", "duo", "lagomorph", "leporid", "mammal", "plushie", "rabbit", "romantic", "romantic_couple", "teal_eyes"], "selected_tags": ["<3", "anthro", "blue_eyes", "blush", "clothed", "clothing", "coat", "duo", "holding_object", "holding_plushie", "looking_at_viewer", "plushie", "relationship", "teal_eyes", "topwear"], "stage3_selected": ["blue_eyes", "coat", "holding_plushie", "relationship", "teal_eyes"], "stage3_selected_scores": {"blue_eyes": 0.6151, "coat": 0.6383, "teal_eyes": 0.6283, "holding_plushie": 0.7793, "relationship": 0.6206}, "stage3_selected_ranks": {"blue_eyes": 9, "coat": 5, "teal_eyes": 6, "holding_plushie": 2, "relationship": 7}, "stage3_selected_phrase_ranks": {"blue_eyes": 1, "coat": 1, "teal_eyes": 1, 
"holding_plushie": 1, "relationship": 1}, "extra_evidence": {"<3": {"source": "probe"}, "coat": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6383}, "holding_object": {"source": "implied"}, "holding_plushie": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7793}, "looking_at_viewer": {"source": "structural"}, "relationship": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6206}, "topwear": {"source": "implied"}}, "structural": ["duo", "anthro", "clothed", "looking_at_viewer"], "probe": ["clothing", "anthro", "blush", "duo", "<3"], "t1": 2.38, "t2": 1.53, "t3": 3.67, "t3s": 0.93, "t3p": 1.03, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=16 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
5
+ {"id": 2021552, "n_gt": 25, "n_retrieved": 17, "n_selected": 28, "n_implied": 13, "n_structural": 4, "n_probe": 3, "ret_R": 0.48, "P": 0.7143, "R": 0.8, "F1": 0.7547, "leaf_P": 0.7143, "leaf_R": 0.6667, "leaf_F1": 0.6897, "n_leaf_sel": 14, "n_leaf_gt": 15, "ret_P": 0.7059, "sel_given_ret": 1.6667, "over_sel": 1.12, "why": {"explicit": 11}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 16, "attempts_by_n_local": {"19": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.7143, "gen_R": 0.8, "gen_F1": 0.7547, "missed": ["canid", "canine", "crossed_arms", "fox", "looking_at_another"], "extra": ["black_bottomwear", "black_clothing", "black_pants", "looking_at_viewer", "open_mouth", "white_clothing", "white_shirt", "white_topwear"], "ground_truth_tags": ["anthro", "bottomwear", "canid", "canine", "claws", "clothed", "clothing", "crossed_arms", "duo", "facial_markings", "fox", "fur", "grey_background", "head_markings", "lagomorph", "leporid", "looking_at_another", "mammal", "markings", "overalls", "pants", "rabbit", "shirt", "standing", "topwear"], "selected_tags": ["anthro", "black_bottomwear", "black_clothing", "black_pants", "bottomwear", "claws", "clothed", "clothing", "duo", "facial_markings", "fur", "grey_background", "head_markings", "lagomorph", "leporid", "looking_at_viewer", "mammal", "markings", "open_mouth", "overalls", "pants", "rabbit", "shirt", "standing", "topwear", "white_clothing", "white_shirt", "white_topwear"], "stage3_selected": ["black_pants", "claws", "facial_markings", "fur", 
"grey_background", "open_mouth", "overalls", "rabbit", "shirt", "standing", "white_shirt"], "stage3_selected_scores": {"fur": 0.6532, "open_mouth": 0.6331, "claws": 0.6304, "standing": 0.6879, "shirt": 0.7484, "rabbit": 0.6511, "grey_background": 0.6785, "facial_markings": 0.6946, "white_shirt": 0.8198, "overalls": 0.8776, "black_pants": 0.8331}, "stage3_selected_ranks": {"fur": 12, "open_mouth": 15, "claws": 16, "standing": 9, "shirt": 6, "rabbit": 13, "grey_background": 11, "facial_markings": 8, "white_shirt": 4, "overalls": 2, "black_pants": 3}, "stage3_selected_phrase_ranks": {"fur": 1, "open_mouth": 1, "claws": 1, "standing": 1, "shirt": 1, "rabbit": 1, "grey_background": 1, "facial_markings": 1, "white_shirt": 1, "overalls": 1, "black_pants": 1}, "extra_evidence": {"black_bottomwear": {"source": "implied"}, "black_clothing": {"source": "implied"}, "black_pants": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8331}, "looking_at_viewer": {"source": "structural"}, "open_mouth": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6331}, "white_clothing": {"source": "implied"}, "white_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8198}, "white_topwear": {"source": "implied"}}, "structural": ["duo", "anthro", "clothed", "looking_at_viewer"], "probe": ["simple_background", "anthro", "duo"], "t1": 1.78, "t2": 1.56, "t3": 3.98, "t3s": 0.93, "t3p": 3.06, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=19 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
6
+ {"id": 1023509, "n_gt": 13, "n_retrieved": 21, "n_selected": 19, "n_implied": 5, "n_structural": 4, "n_probe": 4, "ret_R": 0.3077, "P": 0.5263, "R": 0.7692, "F1": 0.625, "leaf_P": 0.3846, "leaf_R": 0.8333, "leaf_F1": 0.5263, "n_leaf_sel": 13, "n_leaf_gt": 6, "ret_P": 0.1905, "sel_given_ret": 2.5, "over_sel": 1.46, "why": {"explicit": 9}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 20, "attempts_by_n_local": {"20": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.5263, "gen_R": 0.7692, "gen_F1": 0.625, "missed": ["fur", "white_body", "white_fur"], "extra": ["anthro", "clothed", "clothing", "darkness", "group", "light", "lying_on_ground", "note", "solo"], "ground_truth_tags": ["bovid", "caprine", "dialogue", "fur", "goat", "human", "lizard", "mammal", "reptile", "scalie", "text", "white_body", "white_fur"], "selected_tags": ["anthro", "bovid", "caprine", "clothed", "clothing", "darkness", "dialogue", "goat", "group", "human", "light", "lizard", "lying_on_ground", "mammal", "note", "reptile", "scalie", "solo", "text"], "stage3_selected": ["darkness", "dialogue", "goat", "group", "human", "light", "lizard", "lying_on_ground", "note"], "stage3_selected_scores": {"dialogue": 0.7405, "group": 0.6236, "human": 0.664, "lizard": 0.8364, "goat": 0.775, "light": 0.7785, "lying_on_ground": 0.7876, "darkness": 0.8348, "note": 0.7377}, "stage3_selected_ranks": {"dialogue": 10, "group": 15, "human": 13, "lizard": 1, "goat": 5, "light": 4, "lying_on_ground": 3, "darkness": 2, "note": 11}, 
"stage3_selected_phrase_ranks": {"dialogue": 1, "group": 1, "human": 1, "lizard": 1, "goat": 1, "light": 1, "lying_on_ground": 1, "darkness": 1, "note": 1}, "extra_evidence": {"anthro": {"source": "probe"}, "clothed": {"source": "structural"}, "clothing": {"source": "probe"}, "darkness": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8348}, "group": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6236}, "light": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7785}, "lying_on_ground": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7876}, "note": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7377}, "solo": {"source": "structural"}}, "structural": ["solo", "group", "clothed", "text"], "probe": ["clothing", "simple_background", "anthro", "text"], "t1": 2.79, "t2": 1.87, "t3": 5.37, "t3s": 0.99, "t3p": 3.86, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=20 entity=0 copyright_filtered=1 generic_char_to_general=0 unknown_type=0"]}
7
+ {"id": 335343, "n_gt": 15, "n_retrieved": 24, "n_selected": 15, "n_implied": 1, "n_structural": 3, "n_probe": 2, "ret_R": 0.6, "P": 0.6667, "R": 0.6667, "F1": 0.6667, "leaf_P": 0.5714, "leaf_R": 0.6667, "leaf_F1": 0.6154, "n_leaf_sel": 14, "n_leaf_gt": 12, "ret_P": 0.375, "sel_given_ret": 1.1111, "over_sel": 1.0, "why": {"explicit": 11}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 18, "attempts_by_n_local": {"25": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.6667, "gen_R": 0.6667, "gen_F1": 0.6667, "missed": ["angry", "bed", "eyes_closed", "eyeshadow", "furniture"], "extra": ["annoyed_expression", "anthro", "atmosphere", "humanoid", "playful"], "ground_truth_tags": ["angry", "bed", "blonde_hair", "blue_eyes", "duo", "eyes_closed", "eyeshadow", "furniture", "green_eyes", "hair", "lying", "makeup", "purple_hair", "sleeping", "text"], "selected_tags": ["annoyed_expression", "anthro", "atmosphere", "blonde_hair", "blue_eyes", "duo", "green_eyes", "hair", "humanoid", "lying", "makeup", "playful", "purple_hair", "sleeping", "text"], "stage3_selected": ["annoyed_expression", "atmosphere", "blonde_hair", "blue_eyes", "green_eyes", "lying", "makeup", "playful", "purple_hair", "sleeping", "text"], "stage3_selected_scores": {"text": 0.6007, "blue_eyes": 0.6013, "lying": 0.4494, "green_eyes": 0.5989, "blonde_hair": 0.5986, "purple_hair": 0.5642, "makeup": 0.5965, "sleeping": 0.6027, "playful": 0.4463, "annoyed_expression": 0.7251, "atmosphere": 0.5048}, "stage3_selected_ranks": {"text": 8, "blue_eyes": 7, 
"lying": 22, "green_eyes": 9, "blonde_hair": 10, "purple_hair": 13, "makeup": 11, "sleeping": 6, "playful": 23, "annoyed_expression": 2, "atmosphere": 18}, "stage3_selected_phrase_ranks": {"text": 1, "blue_eyes": 1, "lying": 1, "green_eyes": 1, "blonde_hair": 1, "purple_hair": 1, "makeup": 1, "sleeping": 1, "playful": 1, "annoyed_expression": 1, "atmosphere": 1}, "extra_evidence": {"annoyed_expression": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7251}, "anthro": {"source": "probe"}, "atmosphere": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5048}, "humanoid": {"source": "structural"}, "playful": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4463}}, "structural": ["duo", "humanoid", "text"], "probe": ["anthro", "duo"], "t1": 1.99, "t2": 2.09, "t3": 3.25, "t3s": 0.99, "t3p": 2.55, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=25 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
8
+ {"id": 2034167, "n_gt": 11, "n_retrieved": 23, "n_selected": 23, "n_implied": 7, "n_structural": 5, "n_probe": 3, "ret_R": 0.5455, "P": 0.3478, "R": 0.7273, "F1": 0.4706, "leaf_P": 0.2857, "leaf_R": 0.5714, "leaf_F1": 0.381, "n_leaf_sel": 14, "n_leaf_gt": 7, "ret_P": 0.2609, "sel_given_ret": 1.3333, "over_sel": 2.09, "why": {"explicit": 11}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 12, "attempts_by_n_local": {"26": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3478, "gen_R": 0.7273, "gen_F1": 0.4706, "missed": ["open_mouth", "white_body", "white_fur"], "extra": ["animal_humanoid", "anthro", "blue_stripes", "canid_humanoid", "canine_humanoid", "clothed", "clothing", "curved_tail", "humanoid", "intersex", "looking_at_viewer", "mammal_humanoid", "pink_stripes", "stripes", "tail"], "ground_truth_tags": ["blue_eyes", "blue_nose", "canid", "canine", "fur", "mammal", "open_mouth", "purple_body", "solo", "white_body", "white_fur"], "selected_tags": ["animal_humanoid", "anthro", "blue_eyes", "blue_nose", "blue_stripes", "canid", "canid_humanoid", "canine", "canine_humanoid", "clothed", "clothing", "curved_tail", "fur", "humanoid", "intersex", "looking_at_viewer", "mammal", "mammal_humanoid", "pink_stripes", "purple_body", "solo", "stripes", "tail"], "stage3_selected": ["blue_eyes", "blue_nose", "blue_stripes", "canine_humanoid", "curved_tail", "fur", "humanoid", "pink_stripes", "purple_body", "simple_background", "tail"], "stage3_selected_scores": {"fur": 0.5887, "simple_background": 0.5994, "tail": 
0.6162, "blue_eyes": 0.6045, "humanoid": 0.675, "purple_body": 0.5693, "canine_humanoid": 0.9013, "blue_nose": 0.6049, "blue_stripes": 0.6786, "pink_stripes": 0.6846, "curved_tail": 0.6409}, "stage3_selected_ranks": {"fur": 18, "simple_background": 17, "tail": 10, "blue_eyes": 14, "humanoid": 6, "purple_body": 20, "canine_humanoid": 1, "blue_nose": 13, "blue_stripes": 5, "pink_stripes": 4, "curved_tail": 7}, "stage3_selected_phrase_ranks": {"fur": 1, "simple_background": 1, "tail": 1, "blue_eyes": 1, "humanoid": 1, "purple_body": 1, "canine_humanoid": 1, "blue_nose": 1, "blue_stripes": 1, "pink_stripes": 1, "curved_tail": 1}, "extra_evidence": {"animal_humanoid": {"source": "implied"}, "anthro": {"source": "structural"}, "blue_stripes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6786}, "canid_humanoid": {"source": "implied"}, "canine_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9013}, "clothed": {"source": "structural"}, "clothing": {"source": "implied"}, "curved_tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6409}, "humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.675}, "intersex": {"source": "structural"}, "looking_at_viewer": {"source": "structural"}, "mammal_humanoid": {"source": "implied"}, "pink_stripes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6846}, "stripes": {"source": "implied"}, "tail": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6162}}, "structural": ["solo", "anthro", "intersex", "clothed", "looking_at_viewer"], "probe": ["anthro", "canid", "solo"], "t1": 8.0, "t2": 2.08, "t3": 3.24, "t3s": 1.22, "t3p": 1.73, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=26 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=4"]}
9
+ {"id": 1325009, "n_gt": 22, "n_retrieved": 20, "n_selected": 23, "n_implied": 6, "n_structural": 5, "n_probe": 3, "ret_R": 0.1818, "P": 0.5652, "R": 0.5909, "F1": 0.5778, "leaf_P": 0.2667, "leaf_R": 0.3333, "leaf_F1": 0.2963, "n_leaf_sel": 15, "n_leaf_gt": 12, "ret_P": 0.2, "sel_given_ret": 3.25, "over_sel": 1.05, "why": {"explicit": 11}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 15, "attempts_by_n_local": {"21": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.5652, "gen_R": 0.5909, "gen_F1": 0.5778, "missed": ["chest_tuft", "countershading", "hand_on_head", "muscular", "muscular_anthro", "muscular_male", "pantherine", "tiger", "tuft"], "extra": ["countershade_body", "eyes", "gesture", "pose", "raised_hand", "striped_body", "striped_fur", "white_chest", "yellow_bottomwear", "yellow_clothing"], "ground_truth_tags": ["anthro", "blue_eyes", "bottomwear", "chest_tuft", "clothed", "clothing", "countershading", "felid", "fur", "hand_on_head", "male", "mammal", "muscular", "muscular_anthro", "muscular_male", "pantherine", "shorts", "solo", "stripes", "tiger", "topless", "tuft"], "selected_tags": ["anthro", "blue_eyes", "bottomwear", "clothed", "clothing", "countershade_body", "eyes", "felid", "fur", "gesture", "male", "mammal", "pose", "raised_hand", "shorts", "solo", "striped_body", "striped_fur", "stripes", "topless", "white_chest", "yellow_bottomwear", "yellow_clothing"], "stage3_selected": ["blue_eyes", "countershade_body", "eyes", "fur", "gesture", "pose", "raised_hand", "shorts", "striped_fur", 
"white_chest", "yellow_bottomwear"], "stage3_selected_scores": {"fur": 0.597, "blue_eyes": 0.5852, "pose": 0.638, "shorts": 0.5953, "gesture": 0.6013, "striped_fur": 0.6559, "raised_hand": 0.7033, "yellow_bottomwear": 0.6671, "white_chest": 0.9245, "countershade_body": 0.872, "eyes": 0.9776}, "stage3_selected_ranks": {"fur": 14, "blue_eyes": 16, "pose": 10, "shorts": 15, "gesture": 13, "striped_fur": 9, "raised_hand": 6, "yellow_bottomwear": 8, "white_chest": 2, "countershade_body": 3, "eyes": 1}, "stage3_selected_phrase_ranks": {"fur": 1, "blue_eyes": 1, "pose": 1, "shorts": 1, "gesture": 1, "striped_fur": 1, "raised_hand": 1, "yellow_bottomwear": 1, "white_chest": 1, "countershade_body": 1, "eyes": 1}, "extra_evidence": {"countershade_body": {"source": "stage3", "why": "explicit", "retrieval_score": 0.872}, "eyes": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9776}, "gesture": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6013}, "pose": {"source": "stage3", "why": "explicit", "retrieval_score": 0.638}, "raised_hand": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7033}, "striped_body": {"source": "implied"}, "striped_fur": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6559}, "white_chest": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9245}, "yellow_bottomwear": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6671}, "yellow_clothing": {"source": "implied"}}, "structural": ["solo", "anthro", "male", "clothed", "topless"], "probe": ["anthro", "felid", "solo"], "t1": 1.82, "t2": 1.77, "t3": 2.75, "t3s": 0.94, "t3p": 1.67, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=21 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
10
+ {"id": 3285630, "n_gt": 12, "n_retrieved": 15, "n_selected": 24, "n_implied": 7, "n_structural": 4, "n_probe": 5, "ret_R": 0.25, "P": 0.4167, "R": 0.8333, "F1": 0.5556, "leaf_P": 0.3846, "leaf_R": 0.5556, "leaf_F1": 0.4545, "n_leaf_sel": 13, "n_leaf_gt": 9, "ret_P": 0.2, "sel_given_ret": 3.3333, "over_sel": 2.0, "why": {"explicit": 12}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 18, "attempts_by_n_local": {"19": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.4167, "gen_R": 0.8333, "gen_F1": 0.5556, "missed": ["fingers", "male"], "extra": ["black_body", "black_fur", "holding_mug", "holding_object", "mug", "necktie", "shirt", "teal_shirt", "teal_topwear", "text", "topless", "topwear", "vest", "white_necktie"], "ground_truth_tags": ["alpha_channel", "anthro", "clothed", "clothing", "felid", "feline", "fingers", "fur", "hair", "male", "mammal", "solo"], "selected_tags": ["alpha_channel", "anthro", "black_body", "black_fur", "clothed", "clothing", "felid", "feline", "fur", "hair", "holding_mug", "holding_object", "mammal", "mug", "necktie", "shirt", "solo", "teal_shirt", "teal_topwear", "text", "topless", "topwear", "vest", "white_necktie"], "stage3_selected": ["black_fur", "feline", "fur", "hair", "holding_mug", "mug", "necktie", "shirt", "teal_shirt", "transparent_background", "vest", "white_necktie"], "stage3_selected_scores": {"hair": 0.7279, "fur": 0.7575, "feline": 0.7328, "shirt": 0.8216, "black_fur": 0.7477, "necktie": 0.7525, "transparent_background": 0.7407, "vest": 0.8646, "mug": 0.8935, 
"holding_mug": 0.9171, "teal_shirt": 0.7462, "white_necktie": 0.6377}, "stage3_selected_ranks": {"hair": 13, "fur": 6, "feline": 12, "shirt": 4, "black_fur": 9, "necktie": 8, "transparent_background": 11, "vest": 3, "mug": 2, "holding_mug": 1, "teal_shirt": 10, "white_necktie": 16}, "stage3_selected_phrase_ranks": {"hair": 1, "fur": 1, "feline": 1, "shirt": 1, "black_fur": 1, "necktie": 1, "transparent_background": 1, "vest": 1, "mug": 1, "holding_mug": 1, "teal_shirt": 1, "white_necktie": 1}, "extra_evidence": {"black_body": {"source": "implied"}, "black_fur": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7477}, "holding_mug": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9171}, "holding_object": {"source": "implied"}, "mug": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8935}, "necktie": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7525}, "shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8216}, "teal_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7462}, "teal_topwear": {"source": "implied"}, "text": {"source": "probe"}, "topless": {"source": "structural"}, "topwear": {"source": "implied"}, "vest": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8646}, "white_necktie": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6377}}, "structural": ["solo", "anthro", "clothed", "topless"], "probe": ["clothing", "anthro", "text", "felid", "solo"], "t1": 1.76, "t2": 1.82, "t3": 2.02, "t3s": 2.36, "t3p": 1.38, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=19 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
11
+ {"id": 260449, "n_gt": 14, "n_retrieved": 18, "n_selected": 22, "n_implied": 5, "n_structural": 5, "n_probe": 5, "ret_R": 0.5, "P": 0.5, "R": 0.7857, "F1": 0.6111, "leaf_P": 0.3571, "leaf_R": 0.5, "leaf_F1": 0.4167, "n_leaf_sel": 14, "n_leaf_gt": 10, "ret_P": 0.3889, "sel_given_ret": 1.5714, "over_sel": 1.57, "why": {"explicit": 12}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 17, "attempts_by_n_local": {"21": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.5, "gen_R": 0.7857, "gen_F1": 0.6111, "missed": ["fur", "hair", "human"], "extra": ["anthro", "bottomwear", "cheeky", "duo", "grin", "laugh", "loincloth", "raised_arms", "smile", "topless", "trio"], "ground_truth_tags": ["ape", "bear", "clothed", "clothing", "dancing", "fur", "group", "hair", "haplorhine", "human", "looking_at_viewer", "male", "mammal", "primate"], "selected_tags": ["anthro", "ape", "bear", "bottomwear", "cheeky", "clothed", "clothing", "dancing", "duo", "grin", "group", "haplorhine", "laugh", "loincloth", "looking_at_viewer", "male", "mammal", "primate", "raised_arms", "smile", "topless", "trio"], "stage3_selected": ["ape", "bear", "cheeky", "dancing", "grin", "laugh", "loincloth", "looking_at_viewer", "male", "primate", "raised_arms", "simple_background"], "stage3_selected_scores": {"male": 0.5579, "simple_background": 0.5466, "looking_at_viewer": 0.5455, "bear": 0.5731, "grin": 0.5635, "primate": 0.89, "loincloth": 0.5677, "dancing": 0.5556, "laugh": 0.5253, "ape": 0.9764, "raised_arms": 0.5437, "cheeky": 0.3888}, 
"stage3_selected_ranks": {"male": 6, "simple_background": 8, "looking_at_viewer": 9, "bear": 3, "grin": 5, "primate": 2, "loincloth": 4, "dancing": 7, "laugh": 13, "ape": 1, "raised_arms": 10, "cheeky": 20}, "stage3_selected_phrase_ranks": {"male": 1, "simple_background": 1, "looking_at_viewer": 1, "bear": 1, "grin": 1, "primate": 1, "loincloth": 1, "dancing": 1, "laugh": 1, "ape": 1, "raised_arms": 1, "cheeky": 1}, "extra_evidence": {"anthro": {"source": "structural"}, "bottomwear": {"source": "implied"}, "cheeky": {"source": "stage3", "why": "explicit", "retrieval_score": 0.3888}, "duo": {"source": "probe"}, "grin": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5635}, "laugh": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5253}, "loincloth": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5677}, "raised_arms": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5437}, "smile": {"source": "implied"}, "topless": {"source": "structural"}, "trio": {"source": "structural"}}, "structural": ["trio", "anthro", "male", "clothed", "topless"], "probe": ["anthro", "duo", "group", "bear", "simple_background"], "t1": 8.77, "t2": 2.21, "t3": 4.46, "t3s": 1.39, "t3p": 1.6, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=21 entity=0 copyright_filtered=0 generic_char_to_general=1 unknown_type=2"]}
data/eval_results/why_gate_compare_strong_implied_n10.jsonl ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"_meta": true, "timestamp": "2026-03-02T12:32:32.397015", "n_samples": 10, "caption_field": "caption_cogvlm", "skip_rewrite": false, "allow_nsfw": false, "mode": "chunked_map_union", "chunk_size": 60, "eval_path": "data\\eval_samples\\e621_sfw_sample_1000_seed123_buffer10000_caption_evident.jsonl", "per_phrase_k": 2, "per_phrase_final_k": 1, "temperature": 0.0, "shuffle": true, "seed": 42, "workers": 1, "min_why": "strong_implied", "expand_implications": true, "infer_structural": true, "infer_probe": true, "n_errors": 0, "n_issue_samples": 10, "n_issues_total": 20}
2
+ {"id": 17482, "n_gt": 22, "n_retrieved": 14, "n_selected": 26, "n_implied": 15, "n_structural": 4, "n_probe": 3, "ret_R": 0.2273, "P": 0.5769, "R": 0.6818, "F1": 0.625, "leaf_P": 0.4545, "leaf_R": 0.3846, "leaf_F1": 0.4167, "n_leaf_sel": 11, "n_leaf_gt": 13, "ret_P": 0.3571, "sel_given_ret": 3.0, "over_sel": 1.18, "why": {"explicit": 6}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 11, "attempts_by_n_local": {"16": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.5769, "gen_R": 0.6818, "gen_F1": 0.625, "missed": ["bass_guitar", "canine", "fingers", "fur", "holding_musical_instrument", "holding_object", "music"], "extra": ["bottomwear", "denim", "denim_clothing", "flowing_hair", "jeans", "male", "pants", "pastel_background", "torn_bottomwear", "torn_jeans", "torn_pants"], "ground_truth_tags": ["anthro", "bass_guitar", "canid", "canine", "claws", "clothed", "clothing", "fingers", "fur", "guitar", "hair", "holding_musical_instrument", "holding_object", "mammal", "music", "musical_instrument", "plucked_string_instrument", "solo", "spade_tail", "string_instrument", "tail", "torn_clothing"], "selected_tags": ["anthro", "bottomwear", "canid", "claws", "clothed", "clothing", "denim", "denim_clothing", "flowing_hair", "guitar", "hair", "jeans", "male", "mammal", "musical_instrument", "pants", "pastel_background", "plucked_string_instrument", "solo", "spade_tail", "string_instrument", "tail", "torn_bottomwear", "torn_clothing", "torn_jeans", "torn_pants"], "stage3_selected": ["claws", "flowing_hair", 
"guitar", "pastel_background", "spade_tail", "torn_jeans"], "stage3_selected_scores": {"claws": 0.5684, "spade_tail": 0.618, "guitar": 0.9623, "torn_jeans": 0.4824, "flowing_hair": 0.5669, "pastel_background": 0.5632}, "stage3_selected_ranks": {"claws": 8, "spade_tail": 3, "guitar": 1, "torn_jeans": 15, "flowing_hair": 9, "pastel_background": 11}, "stage3_selected_phrase_ranks": {"claws": 1, "spade_tail": 1, "guitar": 1, "torn_jeans": 1, "flowing_hair": 1, "pastel_background": 1}, "extra_evidence": {"bottomwear": {"source": "implied"}, "denim": {"source": "implied"}, "denim_clothing": {"source": "implied"}, "flowing_hair": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5669}, "jeans": {"source": "implied"}, "male": {"source": "structural"}, "pants": {"source": "implied"}, "pastel_background": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5632}, "torn_bottomwear": {"source": "implied"}, "torn_jeans": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4824}, "torn_pants": {"source": "implied"}}, "structural": ["solo", "anthro", "male", "clothed"], "probe": ["anthro", "canid", "solo"], "t1": 2.17, "t2": 2.75, "t3": 4.15, "t3s": 4.67, "t3p": 4.24, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=16 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
3
+ {"id": 1624724, "n_gt": 4, "n_retrieved": 1, "n_selected": 6, "n_implied": 0, "n_structural": 5, "n_probe": 3, "ret_R": 0.0, "P": 0.1667, "R": 0.25, "F1": 0.2, "leaf_P": 0.1667, "leaf_R": 0.25, "leaf_F1": 0.2, "n_leaf_sel": 6, "n_leaf_gt": 4, "ret_P": 0.0, "sel_given_ret": 0.0, "over_sel": 1.5, "why": {}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 3, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 3, "invalid_items_total": 0, "oob_indices_total": 18, "dupe_indices_total": 0, "kept_total": 1, "attempts_by_n_local": {"1": {"attempts": 3, "parse_ok": 3, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.1667, "gen_R": 0.25, "gen_F1": 0.2, "missed": ["red_nose", "smile", "tan_body"], "extra": ["ambiguous_gender", "anthro", "feral", "looking_at_viewer", "nude"], "ground_truth_tags": ["red_nose", "smile", "solo", "tan_body"], "selected_tags": ["ambiguous_gender", "anthro", "feral", "looking_at_viewer", "nude", "solo"], "stage3_selected": [], "stage3_selected_scores": {}, "stage3_selected_ranks": {}, "stage3_selected_phrase_ranks": {}, "extra_evidence": {"ambiguous_gender": {"source": "structural"}, "anthro": {"source": "probe"}, "feral": {"source": "structural"}, "looking_at_viewer": {"source": "structural"}, "nude": {"source": "structural"}}, "structural": ["solo", "feral", "ambiguous_gender", "nude", "looking_at_viewer"], "probe": ["simple_background", "anthro", "solo"], "t1": 8.13, "t2": 0.12, "t3": 6.49, "t3s": 1.65, "t3p": 4.77, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=1 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0"]}
4
+ {"id": 1078019, "n_gt": 14, "n_retrieved": 15, "n_selected": 27, "n_implied": 6, "n_structural": 4, "n_probe": 5, "ret_R": 0.3571, "P": 0.5185, "R": 1.0, "F1": 0.6829, "leaf_P": 0.4211, "leaf_R": 0.8889, "leaf_F1": 0.5714, "n_leaf_sel": 19, "n_leaf_gt": 9, "ret_P": 0.3333, "sel_given_ret": 2.8, "over_sel": 1.93, "why": {"explicit": 5, "strong_implied": 9}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 14, "attempts_by_n_local": {"16": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.5185, "gen_R": 1.0, "gen_F1": 0.6829, "missed": [], "extra": ["<3", "coat", "holding_object", "holding_plushie", "looking_at_viewer", "relationship", "relaxed_expression", "round_eyes", "setting", "small_eyes", "surprised_expression", "topwear", "worried"], "ground_truth_tags": ["anthro", "blue_eyes", "blush", "clothed", "clothing", "duo", "lagomorph", "leporid", "mammal", "plushie", "rabbit", "romantic", "romantic_couple", "teal_eyes"], "selected_tags": ["<3", "anthro", "blue_eyes", "blush", "clothed", "clothing", "coat", "duo", "holding_object", "holding_plushie", "lagomorph", "leporid", "looking_at_viewer", "mammal", "plushie", "rabbit", "relationship", "relaxed_expression", "romantic", "romantic_couple", "round_eyes", "setting", "small_eyes", "surprised_expression", "teal_eyes", "topwear", "worried"], "stage3_selected": ["blue_eyes", "coat", "holding_plushie", "plushie", "rabbit", "relationship", "relaxed_expression", "romantic_couple", "round_eyes", "setting", "small_eyes", "surprised_expression", "teal_eyes", 
"worried"], "stage3_selected_scores": {"blue_eyes": 0.6151, "coat": 0.6383, "teal_eyes": 0.6283, "holding_plushie": 0.7793, "relationship": 0.6206, "rabbit": 0.5939, "romantic_couple": 0.5621, "plushie": 0.7455, "worried": 0.5495, "surprised_expression": 0.639, "small_eyes": 0.6187, "round_eyes": 0.4887, "relaxed_expression": 0.5218, "setting": 0.5567}, "stage3_selected_ranks": {"blue_eyes": 9, "coat": 5, "teal_eyes": 6, "holding_plushie": 2, "relationship": 7, "rabbit": 10, "romantic_couple": 12, "plushie": 3, "worried": 14, "surprised_expression": 4, "small_eyes": 8, "round_eyes": 16, "relaxed_expression": 15, "setting": 13}, "stage3_selected_phrase_ranks": {"blue_eyes": 1, "coat": 1, "teal_eyes": 1, "holding_plushie": 1, "relationship": 1, "rabbit": 1, "romantic_couple": 1, "plushie": 1, "worried": 1, "surprised_expression": 1, "small_eyes": 1, "round_eyes": 1, "relaxed_expression": 1, "setting": 1}, "extra_evidence": {"<3": {"source": "probe"}, "coat": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6383}, "holding_object": {"source": "implied"}, "holding_plushie": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7793}, "looking_at_viewer": {"source": "structural"}, "relationship": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6206}, "relaxed_expression": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.5218}, "round_eyes": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4887}, "setting": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.5567}, "small_eyes": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.6187}, "surprised_expression": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.639}, "topwear": {"source": "implied"}, "worried": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.5495}}, "structural": ["duo", "anthro", "clothed", "looking_at_viewer"], "probe": ["clothing", "anthro", "blush", "duo", "<3"], "t1": 
1.8, "t2": 1.52, "t3": 7.56, "t3s": 1.82, "t3p": 1.0, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=16 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
5
+ {"id": 2021552, "n_gt": 25, "n_retrieved": 17, "n_selected": 30, "n_implied": 13, "n_structural": 4, "n_probe": 3, "ret_R": 0.48, "P": 0.7, "R": 0.84, "F1": 0.7636, "leaf_P": 0.6875, "leaf_R": 0.7333, "leaf_F1": 0.7097, "n_leaf_sel": 16, "n_leaf_gt": 15, "ret_P": 0.7059, "sel_given_ret": 1.75, "over_sel": 1.2, "why": {"explicit": 11, "strong_implied": 2}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 16, "attempts_by_n_local": {"19": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.7, "gen_R": 0.84, "gen_F1": 0.7636, "missed": ["canid", "canine", "fox", "looking_at_another"], "extra": ["black_bottomwear", "black_clothing", "black_pants", "blue_overalls", "looking_at_viewer", "open_mouth", "white_clothing", "white_shirt", "white_topwear"], "ground_truth_tags": ["anthro", "bottomwear", "canid", "canine", "claws", "clothed", "clothing", "crossed_arms", "duo", "facial_markings", "fox", "fur", "grey_background", "head_markings", "lagomorph", "leporid", "looking_at_another", "mammal", "markings", "overalls", "pants", "rabbit", "shirt", "standing", "topwear"], "selected_tags": ["anthro", "black_bottomwear", "black_clothing", "black_pants", "blue_overalls", "bottomwear", "claws", "clothed", "clothing", "crossed_arms", "duo", "facial_markings", "fur", "grey_background", "head_markings", "lagomorph", "leporid", "looking_at_viewer", "mammal", "markings", "open_mouth", "overalls", "pants", "rabbit", "shirt", "standing", "topwear", "white_clothing", "white_shirt", "white_topwear"], "stage3_selected": 
["black_pants", "blue_overalls", "claws", "crossed_arms", "facial_markings", "fur", "grey_background", "open_mouth", "overalls", "rabbit", "shirt", "standing", "white_shirt"], "stage3_selected_scores": {"fur": 0.6532, "open_mouth": 0.6331, "claws": 0.6304, "standing": 0.6879, "shirt": 0.7484, "rabbit": 0.6511, "grey_background": 0.6785, "facial_markings": 0.6946, "white_shirt": 0.8198, "overalls": 0.8776, "black_pants": 0.8331, "crossed_arms": 0.7286, "blue_overalls": 0.9203}, "stage3_selected_ranks": {"fur": 12, "open_mouth": 15, "claws": 16, "standing": 9, "shirt": 6, "rabbit": 13, "grey_background": 11, "facial_markings": 8, "white_shirt": 4, "overalls": 2, "black_pants": 3, "crossed_arms": 7, "blue_overalls": 1}, "stage3_selected_phrase_ranks": {"fur": 1, "open_mouth": 1, "claws": 1, "standing": 1, "shirt": 1, "rabbit": 1, "grey_background": 1, "facial_markings": 1, "white_shirt": 1, "overalls": 1, "black_pants": 1, "crossed_arms": 1, "blue_overalls": 1}, "extra_evidence": {"black_bottomwear": {"source": "implied"}, "black_clothing": {"source": "implied"}, "black_pants": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8331}, "blue_overalls": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.9203}, "looking_at_viewer": {"source": "structural"}, "open_mouth": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6331}, "white_clothing": {"source": "implied"}, "white_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8198}, "white_topwear": {"source": "implied"}}, "structural": ["duo", "anthro", "clothed", "looking_at_viewer"], "probe": ["simple_background", "anthro", "duo"], "t1": 2.43, "t2": 1.59, "t3": 4.84, "t3s": 1.56, "t3p": 3.66, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=19 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
6
+ {"id": 1023509, "n_gt": 13, "n_retrieved": 17, "n_selected": 21, "n_implied": 5, "n_structural": 5, "n_probe": 6, "ret_R": 0.2308, "P": 0.4286, "R": 0.6923, "F1": 0.5294, "leaf_P": 0.2667, "leaf_R": 0.6667, "leaf_F1": 0.381, "n_leaf_sel": 15, "n_leaf_gt": 6, "ret_P": 0.1765, "sel_given_ret": 3.0, "over_sel": 1.62, "why": {"explicit": 9}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 3, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 3, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 17, "attempts_by_n_local": {"17": {"attempts": 3, "parse_ok": 3, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.4286, "gen_R": 0.6923, "gen_F1": 0.5294, "missed": ["dialogue", "fur", "white_body", "white_fur"], "extra": ["anthro", "clothed", "clothing", "darkness", "group", "light", "lying_on_ground", "note", "solo", "speech_bubble", "standing_over", "taur"], "ground_truth_tags": ["bovid", "caprine", "dialogue", "fur", "goat", "human", "lizard", "mammal", "reptile", "scalie", "text", "white_body", "white_fur"], "selected_tags": ["anthro", "bovid", "caprine", "clothed", "clothing", "darkness", "goat", "group", "human", "light", "lizard", "lying_on_ground", "mammal", "note", "reptile", "scalie", "solo", "speech_bubble", "standing_over", "taur", "text"], "stage3_selected": ["darkness", "goat", "human", "light", "lizard", "lying_on_ground", "note", "speech_bubble", "standing_over"], "stage3_selected_scores": {"human": 0.5572, "speech_bubble": 0.5746, "lizard": 0.5943, "goat": 0.5777, "light": 0.5824, "lying_on_ground": 0.5929, "darkness": 0.5977, "note": 0.5658, "standing_over": 0.5799}, "stage3_selected_ranks": {"human": 12, "speech_bubble": 9, 
"lizard": 3, "goat": 8, "light": 6, "lying_on_ground": 4, "darkness": 2, "note": 11, "standing_over": 7}, "stage3_selected_phrase_ranks": {"human": 1, "speech_bubble": 1, "lizard": 1, "goat": 1, "light": 1, "lying_on_ground": 1, "darkness": 1, "note": 1, "standing_over": 1}, "extra_evidence": {"anthro": {"source": "structural"}, "clothed": {"source": "structural"}, "clothing": {"source": "probe"}, "darkness": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5977}, "group": {"source": "structural"}, "light": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5824}, "lying_on_ground": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5929}, "note": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5658}, "solo": {"source": "probe"}, "speech_bubble": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5746}, "standing_over": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5799}, "taur": {"source": "structural"}}, "structural": ["group", "anthro", "taur", "clothed", "text"], "probe": ["clothing", "simple_background", "anthro", "text", "solo", "group"], "t1": 3.15, "t2": 1.7, "t3": 6.84, "t3s": 1.68, "t3p": 3.68, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=17 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0"]}
7
+ {"id": 335343, "n_gt": 15, "n_retrieved": 22, "n_selected": 17, "n_implied": 2, "n_structural": 3, "n_probe": 2, "ret_R": 0.6, "P": 0.5882, "R": 0.6667, "F1": 0.625, "leaf_P": 0.5333, "leaf_R": 0.6667, "leaf_F1": 0.5926, "n_leaf_sel": 15, "n_leaf_gt": 12, "ret_P": 0.4091, "sel_given_ret": 1.1111, "over_sel": 1.13, "why": {"explicit": 12}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 20, "attempts_by_n_local": {"23": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.5882, "gen_R": 0.6667, "gen_F1": 0.625, "missed": ["angry", "bed", "eyes_closed", "eyeshadow", "furniture"], "extra": ["annoyed_expression", "anthro", "bedroom", "humanoid", "membrane_(anatomy)", "purple_membrane", "resting"], "ground_truth_tags": ["angry", "bed", "blonde_hair", "blue_eyes", "duo", "eyes_closed", "eyeshadow", "furniture", "green_eyes", "hair", "lying", "makeup", "purple_hair", "sleeping", "text"], "selected_tags": ["annoyed_expression", "anthro", "bedroom", "blonde_hair", "blue_eyes", "duo", "green_eyes", "hair", "humanoid", "lying", "makeup", "membrane_(anatomy)", "purple_hair", "purple_membrane", "resting", "sleeping", "text"], "stage3_selected": ["annoyed_expression", "bedroom", "blonde_hair", "blue_eyes", "green_eyes", "lying", "makeup", "purple_hair", "purple_membrane", "resting", "sleeping", "text"], "stage3_selected_scores": {"text": 0.6007, "blue_eyes": 0.6013, "lying": 0.4494, "green_eyes": 0.5989, "blonde_hair": 0.5986, "purple_hair": 0.5642, "makeup": 0.5965, "bedroom": 0.4901, "sleeping": 0.6027, "resting": 
0.5034, "annoyed_expression": 0.7251, "purple_membrane": 0.5791}, "stage3_selected_ranks": {"text": 8, "blue_eyes": 7, "lying": 21, "green_eyes": 9, "blonde_hair": 10, "purple_hair": 13, "makeup": 11, "bedroom": 18, "sleeping": 6, "resting": 17, "annoyed_expression": 2, "purple_membrane": 12}, "stage3_selected_phrase_ranks": {"text": 1, "blue_eyes": 1, "lying": 1, "green_eyes": 1, "blonde_hair": 1, "purple_hair": 1, "makeup": 1, "bedroom": 1, "sleeping": 1, "resting": 1, "annoyed_expression": 1, "purple_membrane": 1}, "extra_evidence": {"annoyed_expression": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7251}, "anthro": {"source": "probe"}, "bedroom": {"source": "stage3", "why": "explicit", "retrieval_score": 0.4901}, "humanoid": {"source": "structural"}, "membrane_(anatomy)": {"source": "implied"}, "purple_membrane": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5791}, "resting": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5034}}, "structural": ["duo", "humanoid", "text"], "probe": ["anthro", "duo"], "t1": 2.35, "t2": 2.12, "t3": 4.3, "t3s": 0.92, "t3p": 2.36, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=23 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
8
+ {"id": 2034167, "n_gt": 11, "n_retrieved": 20, "n_selected": 23, "n_implied": 7, "n_structural": 4, "n_probe": 3, "ret_R": 0.4545, "P": 0.3043, "R": 0.6364, "F1": 0.4118, "leaf_P": 0.3333, "leaf_R": 0.5714, "leaf_F1": 0.4211, "n_leaf_sel": 12, "n_leaf_gt": 7, "ret_P": 0.25, "sel_given_ret": 1.4, "over_sel": 2.09, "why": {"explicit": 4, "strong_implied": 7}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 20, "attempts_by_n_local": {"23": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3043, "gen_R": 0.6364, "gen_F1": 0.4118, "missed": ["blue_nose", "fur", "white_body", "white_fur"], "extra": ["action_pose", "animal_humanoid", "anthro", "canid_humanoid", "canine_humanoid", "clothed", "clothing", "curved_tail", "humanoid", "male", "mammal_humanoid", "pink_stripes", "pink_tail", "pose", "stripes", "tail"], "ground_truth_tags": ["blue_eyes", "blue_nose", "canid", "canine", "fur", "mammal", "open_mouth", "purple_body", "solo", "white_body", "white_fur"], "selected_tags": ["action_pose", "animal_humanoid", "anthro", "blue_eyes", "canid", "canid_humanoid", "canine", "canine_humanoid", "clothed", "clothing", "curved_tail", "humanoid", "male", "mammal", "mammal_humanoid", "open_mouth", "pink_stripes", "pink_tail", "pose", "purple_body", "solo", "stripes", "tail"], "stage3_selected": ["action_pose", "blue_eyes", "canine_humanoid", "curved_tail", "open_mouth", "pink_stripes", "pink_tail", "pose", "purple_body", "stripes", "tail"], "stage3_selected_scores": {"open_mouth": 0.561, "blue_eyes": 0.5539, 
"purple_body": 0.5189, "canine_humanoid": 0.9365, "tail": 0.5573, "stripes": 0.5407, "pose": 0.5688, "pink_tail": 0.4493, "action_pose": 0.5588, "pink_stripes": 0.4579, "curved_tail": 0.4919}, "stage3_selected_ranks": {"open_mouth": 5, "blue_eyes": 9, "purple_body": 14, "canine_humanoid": 1, "tail": 7, "stripes": 11, "pose": 4, "pink_tail": 19, "action_pose": 6, "pink_stripes": 18, "curved_tail": 17}, "stage3_selected_phrase_ranks": {"open_mouth": 1, "blue_eyes": 1, "purple_body": 1, "canine_humanoid": 1, "tail": 1, "stripes": 1, "pose": 1, "pink_tail": 1, "action_pose": 1, "pink_stripes": 1, "curved_tail": 1}, "extra_evidence": {"action_pose": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.5588}, "animal_humanoid": {"source": "implied"}, "anthro": {"source": "structural"}, "canid_humanoid": {"source": "implied"}, "canine_humanoid": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9365}, "clothed": {"source": "structural"}, "clothing": {"source": "implied"}, "curved_tail": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4919}, "humanoid": {"source": "implied"}, "male": {"source": "structural"}, "mammal_humanoid": {"source": "implied"}, "pink_stripes": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4579}, "pink_tail": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.4493}, "pose": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.5688}, "stripes": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.5407}, "tail": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.5573}}, "structural": ["solo", "anthro", "male", "clothed"], "probe": ["anthro", "canid", "solo"], "t1": 2.31, "t2": 2.1, "t3": 4.57, "t3s": 1.52, "t3p": 2.17, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=23 entity=0 copyright_filtered=0 generic_char_to_general=0 
unknown_type=5"]}
9
+ {"id": 1325009, "n_gt": 22, "n_retrieved": 18, "n_selected": 22, "n_implied": 6, "n_structural": 5, "n_probe": 3, "ret_R": 0.2273, "P": 0.5909, "R": 0.5909, "F1": 0.5909, "leaf_P": 0.2667, "leaf_R": 0.3333, "leaf_F1": 0.2963, "n_leaf_sel": 15, "n_leaf_gt": 12, "ret_P": 0.2778, "sel_given_ret": 2.6, "over_sel": 1.0, "why": {"explicit": 6, "strong_implied": 4}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 14, "attempts_by_n_local": {"18": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.5909, "gen_R": 0.5909, "gen_F1": 0.5909, "missed": ["chest_tuft", "countershading", "muscular", "muscular_anthro", "muscular_male", "pantherine", "tiger", "topless", "tuft"], "extra": ["countershade_body", "looking_at_viewer", "muscular_arms", "pose", "striped_body", "striped_fur", "white_chest", "yellow_bottomwear", "yellow_clothing"], "ground_truth_tags": ["anthro", "blue_eyes", "bottomwear", "chest_tuft", "clothed", "clothing", "countershading", "felid", "fur", "hand_on_head", "male", "mammal", "muscular", "muscular_anthro", "muscular_male", "pantherine", "shorts", "solo", "stripes", "tiger", "topless", "tuft"], "selected_tags": ["anthro", "blue_eyes", "bottomwear", "clothed", "clothing", "countershade_body", "felid", "fur", "hand_on_head", "looking_at_viewer", "male", "mammal", "muscular_arms", "pose", "shorts", "solo", "striped_body", "striped_fur", "stripes", "white_chest", "yellow_bottomwear", "yellow_clothing"], "stage3_selected": ["blue_eyes", "countershade_body", "fur", "hand_on_head", "muscular_arms", 
"pose", "shorts", "striped_fur", "white_chest", "yellow_bottomwear"], "stage3_selected_scores": {"blue_eyes": 0.5717, "shorts": 0.5785, "striped_fur": 0.6385, "hand_on_head": 0.5932, "yellow_bottomwear": 0.652, "white_chest": 0.9198, "fur": 0.5838, "pose": 0.6235, "muscular_arms": 0.7948, "countershade_body": 0.8754}, "stage3_selected_ranks": {"blue_eyes": 14, "shorts": 13, "striped_fur": 8, "hand_on_head": 11, "yellow_bottomwear": 7, "white_chest": 2, "fur": 12, "pose": 9, "muscular_arms": 4, "countershade_body": 3}, "stage3_selected_phrase_ranks": {"blue_eyes": 1, "shorts": 1, "striped_fur": 1, "hand_on_head": 1, "yellow_bottomwear": 1, "white_chest": 1, "fur": 1, "pose": 1, "muscular_arms": 1, "countershade_body": 1}, "extra_evidence": {"countershade_body": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.8754}, "looking_at_viewer": {"source": "structural"}, "muscular_arms": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.7948}, "pose": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.6235}, "striped_body": {"source": "implied"}, "striped_fur": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6385}, "white_chest": {"source": "stage3", "why": "explicit", "retrieval_score": 0.9198}, "yellow_bottomwear": {"source": "stage3", "why": "explicit", "retrieval_score": 0.652}, "yellow_clothing": {"source": "implied"}}, "structural": ["solo", "anthro", "male", "clothed", "looking_at_viewer"], "probe": ["anthro", "felid", "solo"], "t1": 2.58, "t2": 1.76, "t3": 3.35, "t3s": 1.91, "t3p": 2.8, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=18 entity=0 copyright_filtered=1 generic_char_to_general=0 unknown_type=2"]}
10
+ {"id": 3285630, "n_gt": 12, "n_retrieved": 16, "n_selected": 26, "n_implied": 8, "n_structural": 4, "n_probe": 5, "ret_R": 0.1667, "P": 0.3846, "R": 0.8333, "F1": 0.5263, "leaf_P": 0.3333, "leaf_R": 0.5556, "leaf_F1": 0.4167, "n_leaf_sel": 15, "n_leaf_gt": 9, "ret_P": 0.125, "sel_given_ret": 5.0, "over_sel": 2.17, "why": {"explicit": 12}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 18, "attempts_by_n_local": {"19": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.3846, "gen_R": 0.8333, "gen_F1": 0.5263, "missed": ["alpha_channel", "fingers"], "extra": ["black_body", "black_fur", "business_attire", "formal", "hair_bun", "holding_mug", "holding_object", "mug", "necktie", "shirt", "teal_shirt", "teal_topwear", "text", "topwear", "vest", "white_necktie"], "ground_truth_tags": ["alpha_channel", "anthro", "clothed", "clothing", "felid", "feline", "fingers", "fur", "hair", "male", "mammal", "solo"], "selected_tags": ["anthro", "black_body", "black_fur", "business_attire", "clothed", "clothing", "felid", "feline", "formal", "fur", "hair", "hair_bun", "holding_mug", "holding_object", "male", "mammal", "mug", "necktie", "shirt", "solo", "teal_shirt", "teal_topwear", "text", "topwear", "vest", "white_necktie"], "stage3_selected": ["black_fur", "business_attire", "feline", "formal", "hair_bun", "holding_mug", "necktie", "shirt", "simple_background", "teal_shirt", "vest", "white_necktie"], "stage3_selected_scores": {"simple_background": 0.6978, "feline": 0.7062, "shirt": 0.7998, "black_fur": 0.7183, "necktie": 
0.7314, "vest": 0.8403, "hair_bun": 0.6926, "holding_mug": 0.916, "formal": 0.5993, "business_attire": 0.5558, "teal_shirt": 0.7474, "white_necktie": 0.6418}, "stage3_selected_ranks": {"simple_background": 11, "feline": 10, "shirt": 5, "black_fur": 8, "necktie": 7, "vest": 3, "hair_bun": 12, "holding_mug": 1, "formal": 16, "business_attire": 18, "teal_shirt": 6, "white_necktie": 14}, "stage3_selected_phrase_ranks": {"simple_background": 1, "feline": 1, "shirt": 1, "black_fur": 1, "necktie": 1, "vest": 1, "hair_bun": 1, "holding_mug": 1, "formal": 1, "business_attire": 1, "teal_shirt": 1, "white_necktie": 1}, "extra_evidence": {"black_body": {"source": "implied"}, "black_fur": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7183}, "business_attire": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5558}, "formal": {"source": "stage3", "why": "explicit", "retrieval_score": 0.5993}, "hair_bun": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6926}, "holding_mug": {"source": "stage3", "why": "explicit", "retrieval_score": 0.916}, "holding_object": {"source": "implied"}, "mug": {"source": "implied"}, "necktie": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7314}, "shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7998}, "teal_shirt": {"source": "stage3", "why": "explicit", "retrieval_score": 0.7474}, "teal_topwear": {"source": "implied"}, "text": {"source": "probe"}, "topwear": {"source": "implied"}, "vest": {"source": "stage3", "why": "explicit", "retrieval_score": 0.8403}, "white_necktie": {"source": "stage3", "why": "explicit", "retrieval_score": 0.6418}}, "structural": ["solo", "anthro", "male", "clothed"], "probe": ["clothing", "anthro", "text", "felid", "solo"], "t1": 1.81, "t2": 1.64, "t3": 1.67, "t3s": 1.52, "t3p": 2.57, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=19 entity=0 
copyright_filtered=0 generic_char_to_general=0 unknown_type=2"]}
11
+ {"id": 260449, "n_gt": 14, "n_retrieved": 21, "n_selected": 21, "n_implied": 4, "n_structural": 6, "n_probe": 6, "ret_R": 0.5, "P": 0.4762, "R": 0.7143, "F1": 0.5714, "leaf_P": 0.3077, "leaf_R": 0.4, "leaf_F1": 0.3478, "n_leaf_sel": 13, "n_leaf_gt": 10, "ret_P": 0.3333, "sel_given_ret": 1.4286, "over_sel": 1.5, "why": {"strong_implied": 10}, "stage3_diag": {"mode": "chunked_map_union", "chunk_strategy": "interleave", "chunk_passes": 1, "chunk_shuffle_within_call": false, "calls_total": 1, "calls_with_selection": 1, "calls_exhausted_retries": 0, "attempts_total": 1, "attempt_errors": 0, "attempt_parse_fail": 0, "attempt_parse_ok": 1, "invalid_items_total": 0, "oob_indices_total": 0, "dupe_indices_total": 0, "kept_total": 14, "attempts_by_n_local": {"24": {"attempts": 1, "parse_ok": 1, "parse_fail": 0, "errors": 0}}, "attempt_failure_rate": 0.0, "call_exhaustion_rate": 0.0}, "n_gt_char": 0, "n_sel_char": 0, "char_F1": 1.0, "gen_P": 0.4762, "gen_R": 0.7143, "gen_F1": 0.5714, "missed": ["fur", "hair", "human", "male"], "extra": ["anthro", "bottomwear", "duo", "feral", "grin", "loincloth", "mischievous", "raised_arms", "smile", "topless", "trio"], "ground_truth_tags": ["ape", "bear", "clothed", "clothing", "dancing", "fur", "group", "hair", "haplorhine", "human", "looking_at_viewer", "male", "mammal", "primate"], "selected_tags": ["anthro", "ape", "bear", "bottomwear", "clothed", "clothing", "dancing", "duo", "feral", "grin", "group", "haplorhine", "loincloth", "looking_at_viewer", "mammal", "mischievous", "primate", "raised_arms", "smile", "topless", "trio"], "stage3_selected": ["ape", "bear", "dancing", "grin", "loincloth", "looking_at_viewer", "mischievous", "primate", "raised_arms", "simple_background"], "stage3_selected_scores": {"simple_background": 0.5541, "looking_at_viewer": 0.5522, "bear": 0.5757, "grin": 0.5711, "primate": 0.8911, "loincloth": 0.5719, "dancing": 0.562, "ape": 0.9769, "raised_arms": 0.551, "mischievous": 0.545}, "stage3_selected_ranks": 
{"simple_background": 8, "looking_at_viewer": 9, "bear": 3, "grin": 5, "primate": 2, "loincloth": 4, "dancing": 7, "ape": 1, "raised_arms": 10, "mischievous": 12}, "stage3_selected_phrase_ranks": {"simple_background": 1, "looking_at_viewer": 1, "bear": 1, "grin": 1, "primate": 1, "loincloth": 1, "dancing": 1, "ape": 1, "raised_arms": 1, "mischievous": 1}, "extra_evidence": {"anthro": {"source": "structural"}, "bottomwear": {"source": "implied"}, "duo": {"source": "probe"}, "feral": {"source": "structural"}, "grin": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.5711}, "loincloth": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.5719}, "mischievous": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.545}, "raised_arms": {"source": "stage3", "why": "strong_implied", "retrieval_score": 0.551}, "smile": {"source": "implied"}, "topless": {"source": "structural"}, "trio": {"source": "structural"}}, "structural": ["trio", "anthro", "feral", "clothed", "topless", "looking_at_viewer"], "probe": ["clothing", "simple_background", "anthro", "duo", "group", "bear"], "t1": 2.67, "t2": 2.18, "t3": 5.74, "t3s": 1.29, "t3p": 1.13, "err": null, "issues": ["Stage3 split: general=13 entity=0 copyright_filtered=0 generic_char_to_general=0 unknown_type=0", "Stage3 split: general=24 entity=0 copyright_filtered=0 generic_char_to_general=1 unknown_type=2"]}
data/runtime_debug/eval_no_why_explicit_instruction_n10_20260303T005633Z.json ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp_utc": "2026-03-03T00:56:33Z",
3
+ "config": {
4
+ "dataset": "data\\eval_samples\\e621_sfw_sample_1000_seed123_buffer10000_caption_evident.jsonl",
5
+ "n": 10,
6
+ "mode": "chunked_map_union",
7
+ "chunk_size": 60,
8
+ "retrieval_per_phrase_k": 2,
9
+ "retrieval_per_phrase_final_k": 1,
10
+ "selection_per_phrase_k": 2,
11
+ "selection_schema": "no_why_explicit_instruction",
12
+ "structural": true,
13
+ "probe": true,
14
+ "expand_implications": true
15
+ },
16
+ "summary": {
17
+ "n": 10,
18
+ "avg_P": 0.49907896773114163,
19
+ "avg_R": 0.7686679986679986,
20
+ "avg_F1": 0.5949358584013316,
21
+ "avg_t1": 9.02932870388031,
22
+ "avg_t2": 2.8720282554626464,
23
+ "avg_t3": 4.9426744937896725,
24
+ "stage3_calls_total": 11,
25
+ "stage3_attempts_total": 11,
26
+ "stage3_parse_fail_total": 0,
27
+ "stage3_errors_total": 0,
28
+ "stage3_calls_exhausted": 0
29
+ },
30
+ "results": [
31
+ {
32
+ "id": 3285630,
33
+ "P": 0.4642857142857143,
34
+ "R": 0.9285714285714286,
35
+ "F1": 0.6190476190476191,
36
+ "n_gt": 14,
37
+ "n_sel": 28,
38
+ "t1": 9.331122159957886,
39
+ "t2": 13.36060380935669,
40
+ "t3": 6.582068920135498,
41
+ "stage3_diag": {
42
+ "calls_total": 1,
43
+ "calls_exhausted_retries": 0,
44
+ "attempts_total": 1,
45
+ "attempt_errors": 0,
46
+ "attempt_parse_fail": 0,
47
+ "attempt_parse_ok": 1
48
+ }
49
+ },
50
+ {
51
+ "id": 260449,
52
+ "P": 0.52,
53
+ "R": 0.8666666666666667,
54
+ "F1": 0.65,
55
+ "n_gt": 15,
56
+ "n_sel": 25,
57
+ "t1": 8.170901536941528,
58
+ "t2": 2.0571630001068115,
59
+ "t3": 4.041555881500244,
60
+ "stage3_diag": {
61
+ "calls_total": 1,
62
+ "calls_exhausted_retries": 0,
63
+ "attempts_total": 1,
64
+ "attempt_errors": 0,
65
+ "attempt_parse_fail": 0,
66
+ "attempt_parse_ok": 1
67
+ }
68
+ },
69
+ {
70
+ "id": 1078019,
71
+ "P": 0.6363636363636364,
72
+ "R": 1.0,
73
+ "F1": 0.7777777777777778,
74
+ "n_gt": 14,
75
+ "n_sel": 22,
76
+ "t1": 12.34386157989502,
77
+ "t2": 1.5099613666534424,
78
+ "t3": 1.325575828552246,
79
+ "stage3_diag": {
80
+ "calls_total": 1,
81
+ "calls_exhausted_retries": 0,
82
+ "attempts_total": 1,
83
+ "attempt_errors": 0,
84
+ "attempt_parse_fail": 0,
85
+ "attempt_parse_ok": 1
86
+ }
87
+ },
88
+ {
89
+ "id": 1624724,
90
+ "P": 0.3333333333333333,
91
+ "R": 0.3333333333333333,
92
+ "F1": 0.3333333333333333,
93
+ "n_gt": 6,
94
+ "n_sel": 6,
95
+ "t1": 13.713356494903564,
96
+ "t2": 0.1162874698638916,
97
+ "t3": 1.2268812656402588,
98
+ "stage3_diag": {
99
+ "calls_total": 1,
100
+ "calls_exhausted_retries": 0,
101
+ "attempts_total": 1,
102
+ "attempt_errors": 0,
103
+ "attempt_parse_fail": 0,
104
+ "attempt_parse_ok": 1
105
+ }
106
+ },
107
+ {
108
+ "id": 1325009,
109
+ "P": 0.5833333333333334,
110
+ "R": 0.6363636363636364,
111
+ "F1": 0.6086956521739131,
112
+ "n_gt": 22,
113
+ "n_sel": 24,
114
+ "t1": 8.598191976547241,
115
+ "t2": 1.8005964756011963,
116
+ "t3": 4.540030479431152,
117
+ "stage3_diag": {
118
+ "calls_total": 1,
119
+ "calls_exhausted_retries": 0,
120
+ "attempts_total": 1,
121
+ "attempt_errors": 0,
122
+ "attempt_parse_fail": 0,
123
+ "attempt_parse_ok": 1
124
+ }
125
+ },
126
+ {
127
+ "id": 1023509,
128
+ "P": 0.391304347826087,
129
+ "R": 0.6923076923076923,
130
+ "F1": 0.5,
131
+ "n_gt": 13,
132
+ "n_sel": 23,
133
+ "t1": 4.089767932891846,
134
+ "t2": 1.7381300926208496,
135
+ "t3": 3.972919464111328,
136
+ "stage3_diag": {
137
+ "calls_total": 1,
138
+ "calls_exhausted_retries": 0,
139
+ "attempts_total": 1,
140
+ "attempt_errors": 0,
141
+ "attempt_parse_fail": 0,
142
+ "attempt_parse_ok": 1
143
+ }
144
+ },
145
+ {
146
+ "id": 335343,
147
+ "P": 0.35714285714285715,
148
+ "R": 0.7142857142857143,
149
+ "F1": 0.4761904761904762,
150
+ "n_gt": 14,
151
+ "n_sel": 28,
152
+ "t1": 4.67448353767395,
153
+ "t2": 2.3359692096710205,
154
+ "t3": 8.36922836303711,
155
+ "stage3_diag": {
156
+ "calls_total": 1,
157
+ "calls_exhausted_retries": 0,
158
+ "attempts_total": 1,
159
+ "attempt_errors": 0,
160
+ "attempt_parse_fail": 0,
161
+ "attempt_parse_ok": 1
162
+ }
163
+ },
164
+ {
165
+ "id": 17482,
166
+ "P": 0.5357142857142857,
167
+ "R": 0.6818181818181818,
168
+ "F1": 0.6,
169
+ "n_gt": 22,
170
+ "n_sel": 28,
171
+ "t1": 4.954836368560791,
172
+ "t2": 1.8676352500915527,
173
+ "t3": 5.0674896240234375,
174
+ "stage3_diag": {
175
+ "calls_total": 1,
176
+ "calls_exhausted_retries": 0,
177
+ "attempts_total": 1,
178
+ "attempt_errors": 0,
179
+ "attempt_parse_fail": 0,
180
+ "attempt_parse_ok": 1
181
+ }
182
+ },
183
+ {
184
+ "id": 2021552,
185
+ "P": 0.7407407407407407,
186
+ "R": 0.8333333333333334,
187
+ "F1": 0.7843137254901961,
188
+ "n_gt": 24,
189
+ "n_sel": 27,
190
+ "t1": 8.012149810791016,
191
+ "t2": 1.6340866088867188,
192
+ "t3": 5.091134548187256,
193
+ "stage3_diag": {
194
+ "calls_total": 1,
195
+ "calls_exhausted_retries": 0,
196
+ "attempts_total": 1,
197
+ "attempt_errors": 0,
198
+ "attempt_parse_fail": 0,
199
+ "attempt_parse_ok": 1
200
+ }
201
+ },
202
+ {
203
+ "id": 2034167,
204
+ "P": 0.42857142857142855,
205
+ "R": 1.0,
206
+ "F1": 0.6,
207
+ "n_gt": 12,
208
+ "n_sel": 28,
209
+ "t1": 16.40461564064026,
210
+ "t2": 2.299849271774292,
211
+ "t3": 9.209860563278198,
212
+ "stage3_diag": {
213
+ "calls_total": 2,
214
+ "calls_exhausted_retries": 0,
215
+ "attempts_total": 2,
216
+ "attempt_errors": 0,
217
+ "attempt_parse_fail": 0,
218
+ "attempt_parse_ok": 2
219
+ }
220
+ }
221
+ ]
222
+ }
data/runtime_debug/eval_no_why_n10_20260302T210359Z.json ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp_utc": "2026-03-02T21:03:59Z",
3
+ "config": {
4
+ "dataset": "data\\eval_samples\\e621_sfw_sample_1000_seed123_buffer10000_caption_evident.jsonl",
5
+ "n": 10,
6
+ "mode": "chunked_map_union",
7
+ "chunk_size": 60,
8
+ "retrieval_per_phrase_k": 2,
9
+ "retrieval_per_phrase_final_k": 1,
10
+ "selection_per_phrase_k": 2,
11
+ "selection_schema": "no_why",
12
+ "structural": true,
13
+ "probe": true,
14
+ "expand_implications": true
15
+ },
16
+ "summary": {
17
+ "n": 10,
18
+ "avg_P": 0.4951581196581197,
19
+ "avg_R": 0.7464818514818515,
20
+ "avg_F1": 0.5645442382676424,
21
+ "avg_t1": 9.060380721092224,
22
+ "avg_t2": 1.6346927881240845,
23
+ "avg_t3": 2.9281704664230346,
24
+ "stage3_calls_total": 11,
25
+ "stage3_attempts_total": 11,
26
+ "stage3_parse_fail_total": 0,
27
+ "stage3_errors_total": 0,
28
+ "stage3_calls_exhausted": 0
29
+ },
30
+ "results": [
31
+ {
32
+ "id": 3285630,
33
+ "P": 0.875,
34
+ "R": 0.5,
35
+ "F1": 0.6363636363636364,
36
+ "n_gt": 14,
37
+ "n_sel": 8,
38
+ "t1": 9.5723135471344,
39
+ "t2": 3.2603888511657715,
40
+ "t3": 1.1662352085113525,
41
+ "stage3_diag": {
42
+ "calls_total": 1,
43
+ "calls_exhausted_retries": 0,
44
+ "attempts_total": 1,
45
+ "attempt_errors": 0,
46
+ "attempt_parse_fail": 0,
47
+ "attempt_parse_ok": 1,
48
+ "attempts_by_n_local": {
49
+ "20": {
50
+ "attempts": 1,
51
+ "parse_ok": 1,
52
+ "parse_fail": 0,
53
+ "errors": 0
54
+ }
55
+ }
56
+ }
57
+ },
58
+ {
59
+ "id": 260449,
60
+ "P": 0.43333333333333335,
61
+ "R": 0.8666666666666667,
62
+ "F1": 0.5777777777777778,
63
+ "n_gt": 15,
64
+ "n_sel": 30,
65
+ "t1": 7.419761419296265,
66
+ "t2": 1.5911827087402344,
67
+ "t3": 2.060990333557129,
68
+ "stage3_diag": {
69
+ "calls_total": 1,
70
+ "calls_exhausted_retries": 0,
71
+ "attempts_total": 1,
72
+ "attempt_errors": 0,
73
+ "attempt_parse_fail": 0,
74
+ "attempt_parse_ok": 1,
75
+ "attempts_by_n_local": {
76
+ "20": {
77
+ "attempts": 1,
78
+ "parse_ok": 1,
79
+ "parse_fail": 0,
80
+ "errors": 0
81
+ }
82
+ }
83
+ }
84
+ },
85
+ {
86
+ "id": 1078019,
87
+ "P": 0.5555555555555556,
88
+ "R": 0.7142857142857143,
89
+ "F1": 0.6250000000000001,
90
+ "n_gt": 14,
91
+ "n_sel": 18,
92
+ "t1": 8.502500295639038,
93
+ "t2": 1.3456428050994873,
94
+ "t3": 2.0789365768432617,
95
+ "stage3_diag": {
96
+ "calls_total": 1,
97
+ "calls_exhausted_retries": 0,
98
+ "attempts_total": 1,
99
+ "attempt_errors": 0,
100
+ "attempt_parse_fail": 0,
101
+ "attempt_parse_ok": 1,
102
+ "attempts_by_n_local": {
103
+ "16": {
104
+ "attempts": 1,
105
+ "parse_ok": 1,
106
+ "parse_fail": 0,
107
+ "errors": 0
108
+ }
109
+ }
110
+ }
111
+ },
112
+ {
113
+ "id": 1624724,
114
+ "P": 0.4,
115
+ "R": 1.0,
116
+ "F1": 0.5714285714285715,
117
+ "n_gt": 6,
118
+ "n_sel": 15,
119
+ "t1": 5.102054595947266,
120
+ "t2": 1.01362943649292,
121
+ "t3": 2.029695749282837,
122
+ "stage3_diag": {
123
+ "calls_total": 1,
124
+ "calls_exhausted_retries": 0,
125
+ "attempts_total": 1,
126
+ "attempt_errors": 0,
127
+ "attempt_parse_fail": 0,
128
+ "attempt_parse_ok": 1,
129
+ "attempts_by_n_local": {
130
+ "14": {
131
+ "attempts": 1,
132
+ "parse_ok": 1,
133
+ "parse_fail": 0,
134
+ "errors": 0
135
+ }
136
+ }
137
+ }
138
+ },
139
+ {
140
+ "id": 1325009,
141
+ "P": 0.48,
142
+ "R": 0.5454545454545454,
143
+ "F1": 0.5106382978723404,
144
+ "n_gt": 22,
145
+ "n_sel": 25,
146
+ "t1": 10.626267194747925,
147
+ "t2": 1.750549554824829,
148
+ "t3": 2.414820432662964,
149
+ "stage3_diag": {
150
+ "calls_total": 1,
151
+ "calls_exhausted_retries": 0,
152
+ "attempts_total": 1,
153
+ "attempt_errors": 0,
154
+ "attempt_parse_fail": 0,
155
+ "attempt_parse_ok": 1,
156
+ "attempts_by_n_local": {
157
+ "23": {
158
+ "attempts": 1,
159
+ "parse_ok": 1,
160
+ "parse_fail": 0,
161
+ "errors": 0
162
+ }
163
+ }
164
+ }
165
+ },
166
+ {
167
+ "id": 1023509,
168
+ "P": 0.23076923076923078,
169
+ "R": 0.6923076923076923,
170
+ "F1": 0.34615384615384615,
171
+ "n_gt": 13,
172
+ "n_sel": 39,
173
+ "t1": 15.900179386138916,
174
+ "t2": 1.3188576698303223,
175
+ "t3": 3.010589361190796,
176
+ "stage3_diag": {
177
+ "calls_total": 1,
178
+ "calls_exhausted_retries": 0,
179
+ "attempts_total": 1,
180
+ "attempt_errors": 0,
181
+ "attempt_parse_fail": 0,
182
+ "attempt_parse_ok": 1,
183
+ "attempts_by_n_local": {
184
+ "20": {
185
+ "attempts": 1,
186
+ "parse_ok": 1,
187
+ "parse_fail": 0,
188
+ "errors": 0
189
+ }
190
+ }
191
+ }
192
+ },
193
+ {
194
+ "id": 335343,
195
+ "P": 0.4,
196
+ "R": 0.7142857142857143,
197
+ "F1": 0.5128205128205129,
198
+ "n_gt": 14,
199
+ "n_sel": 25,
200
+ "t1": 6.280893087387085,
201
+ "t2": 1.8548295497894287,
202
+ "t3": 2.6963677406311035,
203
+ "stage3_diag": {
204
+ "calls_total": 1,
205
+ "calls_exhausted_retries": 0,
206
+ "attempts_total": 1,
207
+ "attempt_errors": 0,
208
+ "attempt_parse_fail": 0,
209
+ "attempt_parse_ok": 1,
210
+ "attempts_by_n_local": {
211
+ "23": {
212
+ "attempts": 1,
213
+ "parse_ok": 1,
214
+ "parse_fail": 0,
215
+ "errors": 0
216
+ }
217
+ }
218
+ }
219
+ },
220
+ {
221
+ "id": 17482,
222
+ "P": 0.5769230769230769,
223
+ "R": 0.6818181818181818,
224
+ "F1": 0.6249999999999999,
225
+ "n_gt": 22,
226
+ "n_sel": 26,
227
+ "t1": 3.7739036083221436,
228
+ "t2": 1.246765375137329,
229
+ "t3": 2.3435256481170654,
230
+ "stage3_diag": {
231
+ "calls_total": 1,
232
+ "calls_exhausted_retries": 0,
233
+ "attempts_total": 1,
234
+ "attempt_errors": 0,
235
+ "attempt_parse_fail": 0,
236
+ "attempt_parse_ok": 1,
237
+ "attempts_by_n_local": {
238
+ "18": {
239
+ "attempts": 1,
240
+ "parse_ok": 1,
241
+ "parse_fail": 0,
242
+ "errors": 0
243
+ }
244
+ }
245
+ }
246
+ },
247
+ {
248
+ "id": 2021552,
249
+ "P": 0.6875,
250
+ "R": 0.9166666666666666,
251
+ "F1": 0.7857142857142857,
252
+ "n_gt": 24,
253
+ "n_sel": 32,
254
+ "t1": 11.655076026916504,
255
+ "t2": 1.3419077396392822,
256
+ "t3": 3.532601833343506,
257
+ "stage3_diag": {
258
+ "calls_total": 1,
259
+ "calls_exhausted_retries": 0,
260
+ "attempts_total": 1,
261
+ "attempt_errors": 0,
262
+ "attempt_parse_fail": 0,
263
+ "attempt_parse_ok": 1,
264
+ "attempts_by_n_local": {
265
+ "18": {
266
+ "attempts": 1,
267
+ "parse_ok": 1,
268
+ "parse_fail": 0,
269
+ "errors": 0
270
+ }
271
+ }
272
+ }
273
+ },
274
+ {
275
+ "id": 2034167,
276
+ "P": 0.3125,
277
+ "R": 0.8333333333333334,
278
+ "F1": 0.45454545454545453,
279
+ "n_gt": 12,
280
+ "n_sel": 32,
281
+ "t1": 11.7708580493927,
282
+ "t2": 1.6231741905212402,
283
+ "t3": 7.947941780090332,
284
+ "stage3_diag": {
285
+ "calls_total": 2,
286
+ "calls_exhausted_retries": 0,
287
+ "attempts_total": 2,
288
+ "attempt_errors": 0,
289
+ "attempt_parse_fail": 0,
290
+ "attempt_parse_ok": 2,
291
+ "attempts_by_n_local": {
292
+ "23": {
293
+ "attempts": 1,
294
+ "parse_ok": 1,
295
+ "parse_fail": 0,
296
+ "errors": 0
297
+ },
298
+ "1": {
299
+ "attempts": 1,
300
+ "parse_ok": 1,
301
+ "parse_fail": 0,
302
+ "errors": 0
303
+ }
304
+ }
305
+ }
306
+ }
307
+ ]
308
+ }
data/runtime_debug/false_positive_case_review_looking_anthro_bear_20260304.md ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # False-Positive Review: looking_at_viewer, anthro, bear
2
+
3
+ - Source run detail: `data\eval_results\eval_caption_cogvlm_n10_seed42_20260303_071022_detail.jsonl`
4
+ - Source eval set: `data\eval_samples\e621_sfw_sample_1000_seed123_buffer10000_caption_evident.jsonl`
5
+
6
+ ## Tag Prompt Definitions Used
7
+ - `looking_at_viewer` structural definition: Select only when explicit gaze wording appears (looking at viewer, looking at camera, looking directly at us, direct eye contact). Do not infer from front view, pose, or expression.
8
+ - `looking_at_viewer` probe display text: looking at viewer
9
+ - `anthro` structural definition: An animal/furry character with BOTH human-like body plan (upright stance, human-like torso/arms/hands) and clear animal traits (fur, muzzle, tail, animal ears, paws, or species cues).
10
+ - `anthro` probe display text: anthro - This tag is short for "anthropomorphic animals".
11
+ - `bear` structural definition: (not in structural config)
12
+ - `bear` probe display text: bear
13
+
14
+ ## Cases: false-positive `looking_at_viewer`
15
+ ### sample_id `17482`
16
+ - GT has `looking_at_viewer`: no
17
+ - Selected source for `looking_at_viewer`: structural
18
+ - Full caption: The image showcases an anthropomorphic creature, possibly a wolf or a dog, with a spade tail and claws, playing a bass guitar. The creature is depicted in a dynamic pose, with its hair flowing and fingers poised on the guitar strings. The background is a blend of pastel colors, giving the artwork a dreamy and ethereal feel. The creature's attire appears torn, and it holds the guitar with a sense of passion and dedication.
19
+ - Rewrite phrases: ['anthropomorphic creature', 'wolf or dog', 'spade tail', 'claws', 'playing bass guitar', 'dynamic pose', 'flowing hair', 'fingers on strings', 'pastel background', 'dreamy atmosphere', 'torn attire', 'passionate expression']
20
+ - Structural tags: ['solo', 'anthro', 'clothed', 'looking_at_viewer']
21
+ - Probe tags: ['solo', 'canid', 'anthro']
22
+ - Stage3-selected tags: ['claws', 'flowing_hair', 'pastel_background', 'playing_guitar', 'pose', 'spade_tail', 'torn_jeans']
23
+ - Speculation:
24
+ - Expression/pose cues may be treated as proxy for viewer-facing gaze despite explicit no-inference instruction.
25
+ - Tag came from structural stage where the classifier is biased toward common portrait framing defaults.
26
+
27
+ ### sample_id `1078019`
28
+ - GT has `looking_at_viewer`: no
29
+ - Selected source for `looking_at_viewer`: structural
30
+ - Full caption: The image showcases two anthropomorphic rabbits. The one on the left has a confident and slightly playful expression, with teal eyes and a blush on its cheeks. It's wearing a coat and holding a small plushie. The rabbit on the right appears to be more surprised or taken aback, with wide open blue eyes. Both rabbits seem to be in a close and intimate setting, suggesting a romantic or close relationship between them.
31
+ - Rewrite phrases: ['anthropomorphic rabbits', 'close relationship', 'romantic setting', 'teal eyes', 'blush cheeks', 'confident expression', 'playful expression', 'holding plushie', 'coat', 'wide open eyes', 'blue eyes', 'surprised expression']
32
+ - Structural tags: ['duo', 'anthro', 'clothed', 'looking_at_viewer']
33
+ - Probe tags: ['duo', 'clothing', 'blush', 'anthro', '<3']
34
+ - Stage3-selected tags: ['blue_eyes', 'coat', 'holding_plushie', 'plushie', 'relationship', 'teal_eyes']
35
+ - Speculation:
36
+ - Eye-related words appear, which can be over-read as direct gaze.
37
+ - Expression/pose cues may be treated as proxy for viewer-facing gaze despite explicit no-inference instruction.
38
+ - Tag came from structural stage where the classifier is biased toward common portrait framing defaults.
39
+
40
+ ### sample_id `2021552`
41
+ - GT has `looking_at_viewer`: no
42
+ - Selected source for `looking_at_viewer`: structural
43
+ - Full caption: The image showcases two anthropomorphic characters. On the left is a rabbit-like creature dressed in a white shirt and black pants, standing with crossed arms. On the right is a fox-like character wearing blue overalls and a white shirt, looking towards the rabbit with a slightly open mouth. The background is a simple grey, and both characters have distinct features such as fur, facial markings, and claws.
44
+ - Rewrite phrases: ['rabbit', 'crossed arms', 'white shirt', 'black pants', 'fox', 'blue overalls', 'white shirt', 'looking at rabbit', 'open mouth', 'grey background', 'fur', 'facial markings', 'claws']
45
+ - Structural tags: ['duo', 'anthro', 'clothed', 'looking_at_viewer']
46
+ - Probe tags: ['simple_background', 'felid', 'duo', 'clothing', 'canid', 'blush', 'anthro']
47
+ - Stage3-selected tags: ['black_pants', 'claws', 'crossed_arms', 'facial_markings', 'fur', 'grey_background', 'open_mouth', 'overalls', 'rabbit', 'shirt', 'white_shirt']
48
+ - Speculation:
49
+ - Tag came from structural stage where the classifier is biased toward common portrait framing defaults.
50
+
51
+ ### sample_id `1325009`
52
+ - GT has `looking_at_viewer`: no
53
+ - Selected source for `looking_at_viewer`: structural
54
+ - Full caption: The image showcases an anthropomorphic tiger with striking blue eyes. He is depicted in a muscular and confident pose, with one hand raised to his head in a thoughtful or playful gesture. The tiger has a white chest with a tuft of fur, and his fur is striped in the traditional tiger pattern. He is wearing dark blue shorts, and his muscular physique is accentuated by the lighting in the background, which creates a countershading effect. The overall mood of the image is one of confidence and playfulness.
55
+ - Rewrite phrases: ['anthropomorphic tiger', 'blue eyes', 'muscular pose', 'raised hand', 'white chest', 'tuft of fur', 'striped fur', 'dark blue shorts', 'countershading effect', 'confident expression', 'playful gesture', 'forest background']
56
+ - Structural tags: ['solo', 'anthro', 'male', 'clothed', 'looking_at_viewer']
57
+ - Probe tags: ['solo', 'felid', 'clothing', 'bear', 'anthro']
58
+ - Stage3-selected tags: ['blue_eyes', 'countershade_body', 'fluffy_fur', 'forest_background', 'gesture', 'raised_hand', 'shorts', 'striped_fur', 'tiger', 'white_chest']
59
+ - Speculation:
60
+ - Eye-related words appear, which can be over-read as direct gaze.
61
+ - Expression/pose cues may be treated as proxy for viewer-facing gaze despite explicit no-inference instruction.
62
+ - Tag came from structural stage where the classifier is biased toward common portrait framing defaults.
63
+
64
+ ## Cases: false-positive `anthro`
65
+ ### sample_id `1624724`
66
+ - GT has `anthro`: no
67
+ - Selected source for `anthro`: structural
68
+ - Full caption: The image showcases a cartoonish, smiling creature with large, round eyes and a prominent red nose. It has a tan body with spots and possesses a unique, crosshaped mouth. The creature appears to be floating or hovering against a simple white background.
69
+ - Rewrite phrases: ['cartoon character', 'smiling', 'large eyes', 'red nose', 'tan body', 'spots', 'cross-shaped mouth', 'floating', 'white background']
70
+ - Structural tags: ['solo', 'anthro', 'ambiguous_gender', 'topless']
71
+ - Probe tags: ['solo', 'simple_background', 'bear']
72
+ - Stage3-selected tags: ['big_eyes', 'cartoon_character', 'eyes', 'floating', 'nose', 'pink_mouth', 'red_nose', 'spots', 'tan_body', 'white_background']
73
+ - Speculation:
74
+ - Generic character/creature wording without strict body-plan cues may still trigger anthro in structural/probe stages.
75
+ - Probe list contains anthro with glossary text, increasing its prior when any animal-like terms are present.
76
+
77
+ ### sample_id `1023509`
78
+ - GT has `anthro`: no
79
+ - Selected source for `anthro`: probe
80
+ - Full caption: The image is a multi-panel comic strip. The first panel shows a character lying on the ground, surrounded by darkness, with a speech bubble saying 'I'm done for...'. The next panel depicts a hooded figure standing over the character, with a speech bubble saying 'You're not done for, you're just beginning.'. The following panels show a conversation between the hooded figure and another character, where the hooded figure mentions 'I'm the guardian of the realm of darkness'. The dialogue continues with the hooded figure expressing that the character has been chosen for a task. The final panels depict a group of characters, including a white-furred creature, a goat, a human, and a lizard, discussing a plan to 'defeat the darkness'. The comic ends with a textual note saying 'there is light'.
81
+ - Rewrite phrases: ['darkness', 'lying on ground', 'speech bubble', 'hooded figure', 'standing over', 'speech bubble', 'guardian of realm of darkness', 'chosen for task', 'white-furred creature', 'goat', 'human', 'lizard', 'defeat darkness', 'textual note', 'light']
82
+ - Structural tags: ['solo', 'duo', 'group', 'text']
83
+ - Probe tags: ['group', 'felid', 'bear', 'anthro', '<3']
84
+ - Stage3-selected tags: ['bubble', 'darkness', 'face_mask', 'figurine', 'goat', 'human', 'light', 'lizard', 'lying_on_ground', 'note', 'pear-shaped_figure', 'power_lines', 'speech_bubble', 'standing_over', 'texting', 'underground', 'unknown_species']
85
+ - Speculation:
86
+ - Generic character/creature wording without strict body-plan cues may still trigger anthro in structural/probe stages.
87
+ - Probe list contains anthro with glossary text, increasing its prior when any animal-like terms are present.
88
+
89
+ ### sample_id `335343`
90
+ - GT has `anthro`: no
91
+ - Selected source for `anthro`: probe
92
+ - Full caption: The image showcases two animated characters lying on a bed, seemingly in a resting state. The character on the left has blonde hair, green eyes, and is wearing makeup, with a slightly annoyed or disgruntled expression. The character on the right has purple hair, blue eyes, and a more relaxed or sleeping expression. Between them, there's a text that reads 'Look Before You Sleep', written in a playful font. The image also has a watermark at the bottom left corner that says 'SkyPony'. The overall color palette is dominated by shades of blue and purple, creating a serene and calming ambiance.
93
+ - Rewrite phrases: ['blonde hair', 'green eyes', 'makeup', 'annoyed expression', 'purple hair', 'blue eyes', 'sleeping expression', 'text', 'playful font', 'Look Before You Sleep', 'SkyPony watermark', 'blue and purple color palette', 'serene ambiance', 'bedroom scene', 'two characters lying down', 'resting state', 'calm atmosphere']
94
+ - Structural tags: ['duo', 'humanoid', 'text']
95
+ - Probe tags: ['text', 'simple_background', 'felid', 'duo', 'blush', 'anthro']
96
+ - Stage3-selected tags: ['annoyed_expression', 'atmosphere', 'bedroom', 'blonde_hair', 'blue_eyes', 'distracting_watermark', 'eyes', 'font', 'green_eyes', 'hair', 'lying', 'makeup', 'palette', 'playful', 'purple_hair', 'purple_membrane', 'resting', 'romantic_ambiance', 'sleeping', 'stats', 'text', 'walking', 'watermark']
97
+ - Speculation:
98
+ - Generic character/creature wording without strict body-plan cues may still trigger anthro in structural/probe stages.
99
+ - Probe list contains anthro with glossary text, increasing its prior when any animal-like terms are present.
100
+
101
+ ### sample_id `2034167`
102
+ - GT has `anthro`: no
103
+ - Selected source for `anthro`: structural, probe
104
+ - Full caption: The image showcases a vibrant, animated character that appears to be a fusion of a canine and a humanoid. The character has striking blue eyes, a blue nose, and a purple body with white fur. The character's fur is adorned with vivid pink and blue stripes, and it has a playful, open-mouthed expression. The character's tail is long and curved, with a mix of blue and pink hues. The background is simple, allowing the character to be the focal point. The character appears to be in a dynamic pose, possibly mid-stride or jump.
105
+ - Rewrite phrases: ['blue eyes', 'purple body', 'white fur', 'pink and blue stripes', 'long curved tail', 'blue and pink tail', 'open mouth', 'dynamic pose', 'simple background', 'animated character', 'canine humanoid', 'blue nose']
106
+ - Structural tags: ['solo', 'anthro', 'ambiguous_gender', 'topless']
107
+ - Probe tags: ['solo', 'simple_background', 'canid', 'anthro']
108
+ - Stage3-selected tags: ['action_pose', 'animated_png', 'blue_eyes', 'blue_nose', 'canine_humanoid', 'curved_tail', 'eyes', 'fur', 'half-length_portrait', 'humanoid', 'invalid_background', 'nose', 'open_mouth', 'pink_stripes', 'pink_tail', 'pose', 'purple_body', 'simple_background', 'stripes', 'tail', 'white_fur']
109
+ - Speculation:
110
+ - Generic character/creature wording without strict body-plan cues may still trigger anthro in structural/probe stages.
111
+ - Probe list contains anthro with glossary text, increasing its prior when any animal-like terms are present.
112
+
113
+ ### sample_id `260449`
114
+ - GT has `anthro`: no
115
+ - Selected source for `anthro`: structural, probe
116
+ - Full caption: The image showcases a group of animated characters. On the left, there's a large, jovial ape with a wide grin, raised arms, and a playful expression. In the center, a large, jovial bear is seen laughing and playfully interacting with a young boy, who is dancing with his arms raised. The boy has a cheerful expression and is wearing a loincloth. On the right, there's a smaller, mischievous-looking primate with a tuft of hair on its head, looking directly at the viewer with a cheeky grin. The background is simple, emphasizing the characters.
117
+ - Rewrite phrases: ['ape', 'raised arms', 'wide grin', 'playful expression', 'bear', 'laughing', 'interacting with boy', 'boy', 'dancing', 'arms raised', 'cheerful expression', 'loincloth', 'primate', 'tuft of hair', 'looking at viewer', 'cheeky grin', 'simple background']
118
+ - Structural tags: ['trio', 'anthro', 'feral', 'male', 'clothed', 'topless', 'looking_at_viewer']
119
+ - Probe tags: ['simple_background', 'group', 'duo', 'bear', 'anthro']
120
+ - Stage3-selected tags: ['ape', 'bear', 'cheeky', 'dancing', 'grin', 'hair', 'laugh', 'loincloth', 'looking_at_viewer', 'male', 'primate', 'raised_arm', 'raised_arms', 'simple_background', 'wide_grin']
121
+ - Speculation:
122
+ - Generic character/creature wording without strict body-plan cues may still trigger anthro in structural/probe stages.
123
+ - Probe list contains anthro with glossary text, increasing its prior when any animal-like terms are present.
124
+
125
+ ## Cases: false-positive `bear`
126
+ ### sample_id `1624724`
127
+ - GT has `bear`: no
128
+ - Selected source for `bear`: probe
129
+ - Full caption: The image showcases a cartoonish, smiling creature with large, round eyes and a prominent red nose. It has a tan body with spots and possesses a unique, crosshaped mouth. The creature appears to be floating or hovering against a simple white background.
130
+ - Rewrite phrases: ['cartoon character', 'smiling', 'large eyes', 'red nose', 'tan body', 'spots', 'cross-shaped mouth', 'floating', 'white background']
131
+ - Structural tags: ['solo', 'anthro', 'ambiguous_gender', 'topless']
132
+ - Probe tags: ['solo', 'simple_background', 'bear']
133
+ - Stage3-selected tags: ['big_eyes', 'cartoon_character', 'eyes', 'floating', 'nose', 'pink_mouth', 'red_nose', 'spots', 'tan_body', 'white_background']
134
+ - Speculation:
135
+ - Broad animal appearance cues can match bear weakly when species is underspecified.
136
+ - Bear is injected by probe stage as a standalone species guess (not structural), so one mistaken probe decision adds it directly.
137
+
138
+ ### sample_id `1023509`
139
+ - GT has `bear`: no
140
+ - Selected source for `bear`: probe
141
+ - Full caption: The image is a multi-panel comic strip. The first panel shows a character lying on the ground, surrounded by darkness, with a speech bubble saying 'I'm done for...'. The next panel depicts a hooded figure standing over the character, with a speech bubble saying 'You're not done for, you're just beginning.'. The following panels show a conversation between the hooded figure and another character, where the hooded figure mentions 'I'm the guardian of the realm of darkness'. The dialogue continues with the hooded figure expressing that the character has been chosen for a task. The final panels depict a group of characters, including a white-furred creature, a goat, a human, and a lizard, discussing a plan to 'defeat the darkness'. The comic ends with a textual note saying 'there is light'.
142
+ - Rewrite phrases: ['darkness', 'lying on ground', 'speech bubble', 'hooded figure', 'standing over', 'speech bubble', 'guardian of realm of darkness', 'chosen for task', 'white-furred creature', 'goat', 'human', 'lizard', 'defeat darkness', 'textual note', 'light']
143
+ - Structural tags: ['solo', 'duo', 'group', 'text']
144
+ - Probe tags: ['group', 'felid', 'bear', 'anthro', '<3']
145
+ - Stage3-selected tags: ['bubble', 'darkness', 'face_mask', 'figurine', 'goat', 'human', 'light', 'lizard', 'lying_on_ground', 'note', 'pear-shaped_figure', 'power_lines', 'speech_bubble', 'standing_over', 'texting', 'underground', 'unknown_species']
146
+ - Speculation:
147
+ - Broad animal appearance cues can match bear weakly when species is underspecified.
148
+ - Bear is injected by probe stage as a standalone species guess (not structural), so one mistaken probe decision adds it directly.
149
+
150
+ ### sample_id `1325009`
151
+ - GT has `bear`: no
152
+ - Selected source for `bear`: probe
153
+ - Full caption: The image showcases an anthropomorphic tiger with striking blue eyes. He is depicted in a muscular and confident pose, with one hand raised to his head in a thoughtful or playful gesture. The tiger has a white chest with a tuft of fur, and his fur is striped in the traditional tiger pattern. He is wearing dark blue shorts, and his muscular physique is accentuated by the lighting in the background, which creates a countershading effect. The overall mood of the image is one of confidence and playfulness.
154
+ - Rewrite phrases: ['anthropomorphic tiger', 'blue eyes', 'muscular pose', 'raised hand', 'white chest', 'tuft of fur', 'striped fur', 'dark blue shorts', 'countershading effect', 'confident expression', 'playful gesture', 'forest background']
155
+ - Structural tags: ['solo', 'anthro', 'male', 'clothed', 'looking_at_viewer']
156
+ - Probe tags: ['solo', 'felid', 'clothing', 'bear', 'anthro']
157
+ - Stage3-selected tags: ['blue_eyes', 'countershade_body', 'fluffy_fur', 'forest_background', 'gesture', 'raised_hand', 'shorts', 'striped_fur', 'tiger', 'white_chest']
158
+ - Speculation:
159
+ - Bear is injected by probe stage as a standalone species guess (not structural), so one mistaken probe decision adds it directly.
data/runtime_debug/llm_capture_20260302T162119Z/input_prompt.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ A young male anthro fox with red fur and white chest fluff wearing a black hoodie and jeans, standing in a city street at night, looking at viewer, slight smile, holding a coffee cup
data/runtime_debug/llm_capture_20260302T162202Z/input_prompt.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ A young male anthro fox with red fur and white chest fluff wearing a black hoodie and jeans, standing in a city street at night, looking at viewer, slight smile, holding a coffee cup
data/runtime_debug/llm_capture_20260302T162202Z/structural_request.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "messages": [
3
+ {
4
+ "role": "system",
5
+ "content": "You classify image descriptions by selecting true statements from a numbered list.\n\nThe statements are organized into GROUPS. Each group header tells you how many to pick:\n- \"pick EXACTLY ONE\" = choose the single best match in that group\n- \"pick ALL that apply\" = choose every statement that is true\n\nIMPORTANT RULES:\n1. ONLY select a statement if the description directly says it or makes it very obvious.\n2. Do NOT guess or assume things the description does not mention.\n3. For body type: \"anthro\" means an ANIMAL with a human-shaped body (walks upright, has hands, but still has fur/tail/muzzle). \"humanoid\" means HUMAN or human-like with NO animal features. A wolf standing on two legs = anthro, NOT humanoid.\n4. For gender: only select male/female/intersex when there is explicit textual evidence (such as gender words or pronouns). Do not infer gender from species, body shape, clothing, or style. If no reliable gender cue is present, do not select male/female/intersex; use ambiguous_gender instead.\n5. For clothing state: READ CAREFULLY! \"topless\" = bare chest, wearing pants. \"bottomless\" = wearing shirt, no pants. If unsure, re-read the description.\n6. If clothing is not mentioned, do NOT pick any clothing statement.\n\nReturn JSON ONLY:\n{\"selections\": [{\"i\": 1}, {\"i\": 5}]}\n\nEXAMPLE:\nDescription: \"A muscular male wolf standing in a forest, wearing jeans, giving a thumbs up\"\nAnswer: {\"selections\": [{\"i\": 2}, {\"i\": 6}, {\"i\": 10}, {\"i\": 14}]}\nWhy: One character = solo (2). Wolf standing upright with hands = anthro (6), NOT humanoid because it is a wolf. Male (10). Wearing jeans = clothed (14)."
6
+ },
7
+ {
8
+ "role": "human",
9
+ "content": "Read this image description and select which statements are true.\n\nIMAGE DESCRIPTION:\nA young male anthro fox with red fur and white chest fluff wearing a black hoodie and jeans, standing in a city street at night, looking at viewer, slight smile, holding a coffee cup\n\nSTATEMENTS (pick by number):\n--- CHARACTER COUNT (pick EXACTLY ONE) ---\n1. No characters or living beings are visible in the image.\n2. Exactly one character is visible in the image.\n3. Exactly two characters are visible in the image.\n4. Exactly three characters are visible in the image; select only when the count is clearly three.\n5. Four or more characters are visible in the image; do not use for one, two, or three.\n\n--- BODY TYPE (pick ALL that apply) ---\n6. An animal/furry character with BOTH human-like body plan (upright stance, human-like torso/arms/hands) and clear animal traits (fur, muzzle, tail, animal ears, paws, or species cues).\n7. A non-humanoid animal body plan (typically quadruped or otherwise animal-shaped, without human-like torso/hands). Do not select if explicitly anthropomorphic.\n8. A human or near-human character with no explicit animal-species traits. Do not select if animal species words or animal traits (muzzle, tail, paws, animal ears, heavy fur coat, scales) are present.\n9. Select only for an explicit centaur-like body plan: a humanoid upper torso attached to a separate four-legged lower body.\n\n--- GENDER (pick ALL that apply) ---\n10. Select only when the description explicitly indicates male presentation or identity, such as male/man/boy/he/him/his/father/husband/boyfriend. 'boy' and male pronouns count as explicit evidence.\n11. Select only when the description explicitly indicates female presentation or identity, such as female/woman/girl/she/her/hers/mother/wife/girlfriend. 'girl' and female pronouns count as explicit evidence.\n12. 
Select only when the description explicitly says gender is unknown, ambiguous, androgynous, mixed, or not determinable. Do not use this as a default fallback when gender is simply unmentioned.\n13. Select only when intersex or mixed-sex-traits wording is explicit in the description.\n\n--- CLOTHING STATE (pick ALL that apply) ---\n14. At least one character is explicitly described as wearing clothing or a garment (for example shirt, pants, shorts, dress, coat, loincloth, armor, uniform).\n15. Select only when the description explicitly indicates no clothing (nude/naked/unclothed). Do not infer nude just because clothing is not mentioned.\n16. The upper body/chest is uncovered while lower body has clothing. This includes descriptions with shorts/pants/loincloth and no shirt/top.\n17. The lower body is uncovered while the upper body has clothing.\n\n--- VISUAL ELEMENTS (pick ALL that apply) ---\n18. Select only when explicit gaze wording appears (looking at viewer, looking at camera, looking directly at us, direct eye contact). Do not infer from front view, pose, or expression.\n19. Visible written text, dialogue, signs, or lettering appear in the image.\n"
10
+ }
11
+ ],
12
+ "n_statements": 19
13
+ }
data/runtime_debug/llm_capture_20260302T162202Z/structural_response_parsed.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "selections": [
3
+ {
4
+ "i": 2
5
+ },
6
+ {
7
+ "i": 6
8
+ },
9
+ {
10
+ "i": 10
11
+ },
12
+ {
13
+ "i": 14
14
+ }
15
+ ]
16
+ }
data/runtime_debug/llm_capture_20260302T162202Z/structural_response_raw.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ {"selections": [{"i": 2}, {"i": 6}, {"i": 10}, {"i": 14}]}
data/runtime_debug/llm_capture_20260302T162249Z/input_prompt.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ A young male anthro fox with red fur and white chest fluff wearing a black hoodie and jeans, standing in a city street at night, looking at viewer, slight smile, holding a coffee cup
data/runtime_debug/llm_capture_20260302T162249Z/probe_request.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "messages": [
3
+ {
4
+ "role": "system",
5
+ "content": "You are given a description of an image and a list of imageboard tags.\n\nSelect the tags that correspond to content that would be visible or depicted in the described image.\n\nThe list contains only valid tags; many of them are irrelevant to the image.\n\nReturn JSON ONLY matching this schema:\n\n{\n \"selections\": [\n {\"i\": <int>, \"why\": \"<one of: explicit|strong_implied|weak_implied|style_or_meta|other>\"},\n ...\n ]\n}\n\nRules:\n- Choose ONLY from indices 1..13.\n- Do NOT output tag text.\n- Do NOT output any keys other than \"selections\", and inside each item only the item index \"i\" and \"why\".\n- Do select both a general tag and a more specific tag when both apply (for example, \"shirt\" and \"grey shirt\").\n\nDefine \"why\" as:\n- explicit: directly stated in the image description\n- strong_implied: very likely given the description, even if not literally stated\n- weak_implied: plausible but not strongly supported by the description\n- style_or_meta: stylistic or presentation-related tags only if clearly indicated\n- other: fallback category; use sparingly\n"
6
+ },
7
+ {
8
+ "role": "human",
9
+ "content": "IMAGE DESCRIPTION:\nA young male anthro fox with red fur and white chest fluff wearing a black hoodie and jeans, standing in a city street at night, looking at viewer, slight smile, holding a coffee cup\n\nCANDIDATES (choose by index only):\n1. clothing\n2. simple background\n3. anthro - This tag is short for \"anthropomorphic animals\".\n4. text\n5. blush\n6. canid - The entirety of the dog family, Canidae.\n7. solo - When a scene only features one character.\n8. duo - Tagged for posts that feature two characters, no more and no less than two.\n9. group\n10. felid - The entirety of the cat family.\n11. bird\n12. bear\n13. <3 - The symbol of love, and pleasure.\n\nSelect up to 13 indices. Output fewer if uncertain.\n"
10
+ }
11
+ ],
12
+ "n_candidates": 13,
13
+ "per_call_budget": 13
14
+ }
data/runtime_debug/llm_capture_20260302T162249Z/probe_response_parsed.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "selections": [
3
+ {
4
+ "i": 3,
5
+ "why": "explicit"
6
+ },
7
+ {
8
+ "i": 7,
9
+ "why": "explicit"
10
+ },
11
+ {
12
+ "i": 1,
13
+ "why": "explicit"
14
+ },
15
+ {
16
+ "i": 13,
17
+ "why": "explicit"
18
+ },
19
+ {
20
+ "i": 6,
21
+ "why": "explicit"
22
+ },
23
+ {
24
+ "i": 4,
25
+ "why": "style_or_meta"
26
+ }
27
+ ]
28
+ }
data/runtime_debug/llm_capture_20260302T162249Z/probe_response_raw.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "selections": [
3
+ {"i": 3, "why": "explicit"},
4
+ {"i": 7, "why": "explicit"},
5
+ {"i": 1, "why": "explicit"},
6
+ {"i": 13, "why": "explicit"},
7
+ {"i": 6, "why": "explicit"},
8
+ {"i": 4, "why": "style_or_meta"}
9
+ ]
10
+ }
data/runtime_debug/llm_capture_20260302T162249Z/selection_request.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "messages": [
3
+ {
4
+ "role": "system",
5
+ "content": "You are given a description of an image and a list of imageboard tags.\n\nSelect the tags that correspond to content that would be visible or depicted in the described image.\n\nThe list contains only valid tags; many of them are irrelevant to the image.\n\nReturn JSON ONLY matching this schema:\n\n{\n \"selections\": [\n {\"i\": <int>, \"why\": \"<one of: explicit|strong_implied|weak_implied|style_or_meta|other>\"},\n ...\n ]\n}\n\nRules:\n- Choose ONLY from indices 1..19.\n- Do NOT output tag text.\n- Do NOT output any keys other than \"selections\", and inside each item only the item index \"i\" and \"why\".\n- Do select both a general tag and a more specific tag when both apply (for example, \"shirt\" and \"grey shirt\").\n\nDefine \"why\" as:\n- explicit: directly stated in the image description\n- strong_implied: very likely given the description, even if not literally stated\n- weak_implied: plausible but not strongly supported by the description\n- style_or_meta: stylistic or presentation-related tags only if clearly indicated\n- other: fallback category; use sparingly\n"
6
+ },
7
+ {
8
+ "role": "human",
9
+ "content": "IMAGE DESCRIPTION:\nIMAGE DESCRIPTION: A young male anthro fox with red fur and white chest fluff wearing a black hoodie and jeans, standing in a city street at night, looking at viewer, slight smile, holding a coffee cup\nREWRITE PHRASES: young male, anthro fox, red fur, white chest fluff, black hoodie, jeans, standing, city street, night, looking at viewer, slight smile, holding coffee cup\nINFERRED TAG HINTS (context only): <3, anthro, canid, clothed, clothing, looking_at_viewer, male, solo\n\nCANDIDATES (choose by index only):\n1. black hoodie\n2. hoodie\n3. jeans\n4. young male\n5. street\n6. holding coffee cup\n7. cup\n8. fluffy\n9. standing\n10. fur\n11. slight smile\n12. smile\n13. night\n14. looking at viewer\n15. male\n16. viewer\n17. fox\n18. red fur\n19. white inner ear fluff\n\nSelect up to 42 indices. Output fewer if uncertain.\n"
10
+ }
11
+ ],
12
+ "n_candidates": 19,
13
+ "per_call_budget": 42,
14
+ "mode": "chunked_map_union",
15
+ "chunk_size": 60,
16
+ "selection_per_phrase_k": 2,
17
+ "retrieved_candidate_tags": [
18
+ "black_hoodie",
19
+ "hoodie",
20
+ "jeans",
21
+ "young_male",
22
+ "street",
23
+ "holding_coffee_cup",
24
+ "cup",
25
+ "fluffy",
26
+ "standing",
27
+ "fur",
28
+ "slight_smile",
29
+ "smile",
30
+ "night",
31
+ "looking_at_viewer",
32
+ "male",
33
+ "viewer",
34
+ "fox",
35
+ "red_fur",
36
+ "white_inner_ear_fluff"
37
+ ]
38
+ }
data/runtime_debug/llm_capture_20260302T162249Z/selection_response_parsed.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "selections": []
3
+ }
data/runtime_debug/llm_capture_20260302T162249Z/selection_response_raw.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ {"selections":[]}
data/runtime_debug/llm_capture_20260302T162249Z/structural_request.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "messages": [
3
+ {
4
+ "role": "system",
5
+ "content": "You classify image descriptions by selecting true statements from a numbered list.\n\nThe statements are organized into GROUPS. Each group header tells you how many to pick:\n- \"pick EXACTLY ONE\" = choose the single best match in that group\n- \"pick ALL that apply\" = choose every statement that is true\n\nIMPORTANT RULES:\n1. ONLY select a statement if the description directly says it or makes it very obvious.\n2. Do NOT guess or assume things the description does not mention.\n3. For body type: \"anthro\" means an ANIMAL with a human-shaped body (walks upright, has hands, but still has fur/tail/muzzle). \"humanoid\" means HUMAN or human-like with NO animal features. A wolf standing on two legs = anthro, NOT humanoid.\n4. For gender: only select male/female/intersex when there is explicit textual evidence (such as gender words or pronouns). Do not infer gender from species, body shape, clothing, or style. If no reliable gender cue is present, do not select male/female/intersex; use ambiguous_gender instead.\n5. For clothing state: READ CAREFULLY! \"topless\" = bare chest, wearing pants. \"bottomless\" = wearing shirt, no pants. If unsure, re-read the description.\n6. If clothing is not mentioned, do NOT pick any clothing statement.\n\nReturn JSON ONLY:\n{\"selections\": [{\"i\": 1}, {\"i\": 5}]}\n\nEXAMPLE:\nDescription: \"A muscular male wolf standing in a forest, wearing jeans, giving a thumbs up\"\nAnswer: {\"selections\": [{\"i\": 2}, {\"i\": 6}, {\"i\": 10}, {\"i\": 14}]}\nWhy: One character = solo (2). Wolf standing upright with hands = anthro (6), NOT humanoid because it is a wolf. Male (10). Wearing jeans = clothed (14)."
6
+ },
7
+ {
8
+ "role": "human",
9
+ "content": "Read this image description and select which statements are true.\n\nIMAGE DESCRIPTION:\nA young male anthro fox with red fur and white chest fluff wearing a black hoodie and jeans, standing in a city street at night, looking at viewer, slight smile, holding a coffee cup\n\nSTATEMENTS (pick by number):\n--- CHARACTER COUNT (pick EXACTLY ONE) ---\n1. No characters or living beings are visible in the image.\n2. Exactly one character is visible in the image.\n3. Exactly two characters are visible in the image.\n4. Exactly three characters are visible in the image; select only when the count is clearly three.\n5. Four or more characters are visible in the image; do not use for one, two, or three.\n\n--- BODY TYPE (pick ALL that apply) ---\n6. An animal/furry character with BOTH human-like body plan (upright stance, human-like torso/arms/hands) and clear animal traits (fur, muzzle, tail, animal ears, paws, or species cues).\n7. A non-humanoid animal body plan (typically quadruped or otherwise animal-shaped, without human-like torso/hands). Do not select if explicitly anthropomorphic.\n8. A human or near-human character with no explicit animal-species traits. Do not select if animal species words or animal traits (muzzle, tail, paws, animal ears, heavy fur coat, scales) are present.\n9. Select only for an explicit centaur-like body plan: a humanoid upper torso attached to a separate four-legged lower body.\n\n--- GENDER (pick ALL that apply) ---\n10. Select only when the description explicitly indicates male presentation or identity, such as male/man/boy/he/him/his/father/husband/boyfriend. 'boy' and male pronouns count as explicit evidence.\n11. Select only when the description explicitly indicates female presentation or identity, such as female/woman/girl/she/her/hers/mother/wife/girlfriend. 'girl' and female pronouns count as explicit evidence.\n12. 
Select only when the description explicitly says gender is unknown, ambiguous, androgynous, mixed, or not determinable. Do not use this as a default fallback when gender is simply unmentioned.\n13. Select only when intersex or mixed-sex-traits wording is explicit in the description.\n\n--- CLOTHING STATE (pick ALL that apply) ---\n14. At least one character is explicitly described as wearing clothing or a garment (for example shirt, pants, shorts, dress, coat, loincloth, armor, uniform).\n15. Select only when the description explicitly indicates no clothing (nude/naked/unclothed). Do not infer nude just because clothing is not mentioned.\n16. The upper body/chest is uncovered while lower body has clothing. This includes descriptions with shorts/pants/loincloth and no shirt/top.\n17. The lower body is uncovered while the upper body has clothing.\n\n--- VISUAL ELEMENTS (pick ALL that apply) ---\n18. Select only when explicit gaze wording appears (looking at viewer, looking at camera, looking directly at us, direct eye contact). Do not infer from front view, pose, or expression.\n19. Visible written text, dialogue, signs, or lettering appear in the image.\n"
10
+ }
11
+ ],
12
+ "n_statements": 19
13
+ }
data/runtime_debug/llm_capture_20260302T162249Z/structural_response_parsed.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "selections": [
3
+ {
4
+ "i": 2
5
+ },
6
+ {
7
+ "i": 6
8
+ },
9
+ {
10
+ "i": 10
11
+ },
12
+ {
13
+ "i": 14
14
+ },
15
+ {
16
+ "i": 18
17
+ }
18
+ ]
19
+ }
data/runtime_debug/llm_capture_20260302T162249Z/structural_response_raw.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ {"selections": [{"i": 2}, {"i": 6}, {"i": 10}, {"i": 14}, {"i": 18}]}
data/runtime_debug/llm_capture_20260302T162249Z/summary.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "rewritten": "young male, anthro fox, red fur, white chest fluff, black hoodie, jeans, standing, city street, night, looking at viewer, slight smile, holding coffee cup",
3
+ "structural_tags": [
4
+ "solo",
5
+ "anthro",
6
+ "male",
7
+ "clothed",
8
+ "looking_at_viewer"
9
+ ],
10
+ "probe_tags": [
11
+ "anthro",
12
+ "solo",
13
+ "clothing",
14
+ "<3",
15
+ "canid"
16
+ ],
17
+ "n_retrieved_candidates": 19,
18
+ "retrieved_candidates": [
19
+ "black_hoodie",
20
+ "hoodie",
21
+ "jeans",
22
+ "young_male",
23
+ "street",
24
+ "holding_coffee_cup",
25
+ "cup",
26
+ "fluffy",
27
+ "standing",
28
+ "fur",
29
+ "slight_smile",
30
+ "smile",
31
+ "night",
32
+ "looking_at_viewer",
33
+ "male",
34
+ "viewer",
35
+ "fox",
36
+ "red_fur",
37
+ "white_inner_ear_fluff"
38
+ ],
39
+ "files": [
40
+ "input_prompt.txt",
41
+ "probe_request.json",
42
+ "probe_response_parsed.json",
43
+ "probe_response_raw.txt",
44
+ "selection_request.json",
45
+ "selection_response_parsed.json",
46
+ "selection_response_raw.txt",
47
+ "structural_request.json",
48
+ "structural_response_parsed.json",
49
+ "structural_response_raw.txt"
50
+ ]
51
+ }
data/runtime_debug/selection_why_vs_no_why_20260302T191813Z.json ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp_utc": "2026-03-02T19:15:40Z",
3
+ "model": "meta-llama/llama-3.1-8b-instruct",
4
+ "n_samples": 10,
5
+ "results": [
6
+ {
7
+ "id": 3285630,
8
+ "n_candidates": 18,
9
+ "N": 18,
10
+ "per_call_budget": 38,
11
+ "with_why": {
12
+ "label": "with_why",
13
+ "ok": true,
14
+ "failure_type": "ok",
15
+ "error": null,
16
+ "latency_s": 0.9805651999922702
17
+ },
18
+ "no_why": {
19
+ "label": "no_why",
20
+ "ok": true,
21
+ "failure_type": "ok",
22
+ "error": null,
23
+ "latency_s": 2.9448838000098476
24
+ }
25
+ },
26
+ {
27
+ "id": 260449,
28
+ "n_candidates": 30,
29
+ "N": 30,
30
+ "per_call_budget": 74,
31
+ "with_why": {
32
+ "label": "with_why",
33
+ "ok": true,
34
+ "failure_type": "ok",
35
+ "error": null,
36
+ "latency_s": 5.376112800004194
37
+ },
38
+ "no_why": {
39
+ "label": "no_why",
40
+ "ok": true,
41
+ "failure_type": "ok",
42
+ "error": null,
43
+ "latency_s": 1.4645917000016198
44
+ }
45
+ },
46
+ {
47
+ "id": 1078019,
48
+ "n_candidates": 16,
49
+ "N": 16,
50
+ "per_call_budget": 38,
51
+ "with_why": {
52
+ "label": "with_why",
53
+ "ok": true,
54
+ "failure_type": "ok",
55
+ "error": null,
56
+ "latency_s": 6.614833399988129
57
+ },
58
+ "no_why": {
59
+ "label": "no_why",
60
+ "ok": true,
61
+ "failure_type": "ok",
62
+ "error": null,
63
+ "latency_s": 0.9294685000058962
64
+ }
65
+ },
66
+ {
67
+ "id": 1624724,
68
+ "n_candidates": 17,
69
+ "N": 17,
70
+ "per_call_budget": 36,
71
+ "with_why": {
72
+ "label": "with_why",
73
+ "ok": true,
74
+ "failure_type": "ok",
75
+ "error": null,
76
+ "latency_s": 3.396455699999933
77
+ },
78
+ "no_why": {
79
+ "label": "no_why",
80
+ "ok": true,
81
+ "failure_type": "ok",
82
+ "error": null,
83
+ "latency_s": 2.18461219999881
84
+ }
85
+ },
86
+ {
87
+ "id": 1325009,
88
+ "n_candidates": 21,
89
+ "N": 21,
90
+ "per_call_budget": 46,
91
+ "with_why": {
92
+ "label": "with_why",
93
+ "ok": true,
94
+ "failure_type": "ok",
95
+ "error": null,
96
+ "latency_s": 5.769819000008283
97
+ },
98
+ "no_why": {
99
+ "label": "no_why",
100
+ "ok": true,
101
+ "failure_type": "ok",
102
+ "error": null,
103
+ "latency_s": 2.0555457000009483
104
+ }
105
+ },
106
+ {
107
+ "id": 1023509,
108
+ "n_candidates": 23,
109
+ "N": 23,
110
+ "per_call_budget": 58,
111
+ "with_why": {
112
+ "label": "with_why",
113
+ "ok": true,
114
+ "failure_type": "ok",
115
+ "error": null,
116
+ "latency_s": 5.662558600000921
117
+ },
118
+ "no_why": {
119
+ "label": "no_why",
120
+ "ok": true,
121
+ "failure_type": "ok",
122
+ "error": null,
123
+ "latency_s": 5.300095500002499
124
+ }
125
+ },
126
+ {
127
+ "id": 335343,
128
+ "n_candidates": 26,
129
+ "N": 26,
130
+ "per_call_budget": 56,
131
+ "with_why": {
132
+ "label": "with_why",
133
+ "ok": true,
134
+ "failure_type": "ok",
135
+ "error": null,
136
+ "latency_s": 1.9454404999996768
137
+ },
138
+ "no_why": {
139
+ "label": "no_why",
140
+ "ok": true,
141
+ "failure_type": "ok",
142
+ "error": null,
143
+ "latency_s": 4.1100776999956
144
+ }
145
+ },
146
+ {
147
+ "id": 17482,
148
+ "n_candidates": 14,
149
+ "N": 14,
150
+ "per_call_budget": 28,
151
+ "with_why": {
152
+ "label": "with_why",
153
+ "ok": true,
154
+ "failure_type": "ok",
155
+ "error": null,
156
+ "latency_s": 3.7595577000029152
157
+ },
158
+ "no_why": {
159
+ "label": "no_why",
160
+ "ok": true,
161
+ "failure_type": "ok",
162
+ "error": null,
163
+ "latency_s": 3.8396145999868168
164
+ }
165
+ },
166
+ {
167
+ "id": 2021552,
168
+ "n_candidates": 19,
169
+ "N": 19,
170
+ "per_call_budget": 42,
171
+ "with_why": {
172
+ "label": "with_why",
173
+ "ok": true,
174
+ "failure_type": "ok",
175
+ "error": null,
176
+ "latency_s": 4.009010099995066
177
+ },
178
+ "no_why": {
179
+ "label": "no_why",
180
+ "ok": true,
181
+ "failure_type": "ok",
182
+ "error": null,
183
+ "latency_s": 3.4596964999946067
184
+ }
185
+ },
186
+ {
187
+ "id": 2034167,
188
+ "n_candidates": 27,
189
+ "N": 27,
190
+ "per_call_budget": 58,
191
+ "with_why": {
192
+ "label": "with_why",
193
+ "ok": true,
194
+ "failure_type": "ok",
195
+ "error": null,
196
+ "latency_s": 2.2680347000132315
197
+ },
198
+ "no_why": {
199
+ "label": "no_why",
200
+ "ok": true,
201
+ "failure_type": "ok",
202
+ "error": null,
203
+ "latency_s": 6.154438600002322
204
+ }
205
+ }
206
+ ],
207
+ "summary": {
208
+ "with_why": {
209
+ "ok": 10
210
+ },
211
+ "no_why": {
212
+ "ok": 10
213
+ },
214
+ "avg_latency_s_with_why": 3.9782387700004618,
215
+ "avg_latency_s_no_why": 3.2443024799998965
216
+ }
217
+ }
data/runtime_debug/whyless_replication_seeds_42_43_20260303T060318Z.json ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp_utc": "2026-03-03T05:51:00Z",
3
+ "seeds": [
4
+ 42,
5
+ 43
6
+ ],
7
+ "variants": {
8
+ "with_why_explicit": {
9
+ "by_seed": {
10
+ "42": {
11
+ "n": 10,
12
+ "P": 0.5163875544168061,
13
+ "R": 0.6743697968697968,
14
+ "F1": 0.5748467389701281,
15
+ "t3_s": 8.31660475730896,
16
+ "attempts_total": 0,
17
+ "parse_fail_total": 0,
18
+ "errors_total": 0,
19
+ "calls_total": 0,
20
+ "calls_exhausted": 0
21
+ },
22
+ "43": {
23
+ "n": 10,
24
+ "P": 0.5410290148448043,
25
+ "R": 0.5713611388611388,
26
+ "F1": 0.5253254924582543,
27
+ "t3_s": 10.785722708702087,
28
+ "attempts_total": 0,
29
+ "parse_fail_total": 0,
30
+ "errors_total": 0,
31
+ "calls_total": 0,
32
+ "calls_exhausted": 0
33
+ }
34
+ },
35
+ "avg": {
36
+ "P": 0.5287082846308051,
37
+ "R": 0.6228654678654678,
38
+ "F1": 0.5500861157141912,
39
+ "t3_s": 9.551163733005524,
40
+ "parse_fail_total": 0,
41
+ "errors_total": 0,
42
+ "attempts_total": 0,
43
+ "calls_exhausted": 0
44
+ }
45
+ },
46
+ "no_why": {
47
+ "by_seed": {
48
+ "42": {
49
+ "n": 10,
50
+ "P": 0.526974765974766,
51
+ "R": 0.6552372627372628,
52
+ "F1": 0.5574913389066418,
53
+ "t3_s": 3.2179250478744508,
54
+ "attempts_total": 11,
55
+ "parse_fail_total": 0,
56
+ "errors_total": 0,
57
+ "calls_total": 11,
58
+ "calls_exhausted": 0
59
+ },
60
+ "43": {
61
+ "n": 10,
62
+ "P": 0.5616956032473274,
63
+ "R": 0.6611355311355311,
64
+ "F1": 0.5434100633858101,
65
+ "t3_s": 10.192648196220398,
66
+ "attempts_total": 11,
67
+ "parse_fail_total": 0,
68
+ "errors_total": 0,
69
+ "calls_total": 11,
70
+ "calls_exhausted": 0
71
+ }
72
+ },
73
+ "avg": {
74
+ "P": 0.5443351846110467,
75
+ "R": 0.658186396936397,
76
+ "F1": 0.550450701146226,
77
+ "t3_s": 6.705286622047424,
78
+ "parse_fail_total": 0,
79
+ "errors_total": 0,
80
+ "attempts_total": 22,
81
+ "calls_exhausted": 0
82
+ }
83
+ },
84
+ "no_why_explicit_instruction": {
85
+ "by_seed": {
86
+ "42": {
87
+ "n": 10,
88
+ "P": 0.5039213382541718,
89
+ "R": 0.7805727605727606,
90
+ "F1": 0.5978319552325569,
91
+ "t3_s": 5.193069648742676,
92
+ "attempts_total": 11,
93
+ "parse_fail_total": 0,
94
+ "errors_total": 0,
95
+ "calls_total": 11,
96
+ "calls_exhausted": 0
97
+ },
98
+ "43": {
99
+ "n": 10,
100
+ "P": 0.5479474309215688,
101
+ "R": 0.7035164835164835,
102
+ "F1": 0.5839503345959894,
103
+ "t3_s": 6.33277850151062,
104
+ "attempts_total": 11,
105
+ "parse_fail_total": 0,
106
+ "errors_total": 0,
107
+ "calls_total": 11,
108
+ "calls_exhausted": 0
109
+ }
110
+ },
111
+ "avg": {
112
+ "P": 0.5259343845878703,
113
+ "R": 0.7420446220446221,
114
+ "F1": 0.5908911449142732,
115
+ "t3_s": 5.762924075126648,
116
+ "parse_fail_total": 0,
117
+ "errors_total": 0,
118
+ "attempts_total": 22,
119
+ "calls_exhausted": 0
120
+ }
121
+ }
122
+ }
123
+ }
data/runtime_metrics/ui_pipeline_timings.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {"timestamp_utc": "2026-03-02T12:44:26Z", "stages_s": {"preprocess": 7.90999984019436e-05, "rewrite": 1.9136111999978311, "structural": 1.0946640000038315, "probe": 0.5859509000001708, "retrieval": 4.595289600001706, "selection": 37.53351300000213, "implication_expansion": 0.15133090000017546, "prompt_composition": 6.299999949987978e-05, "group_display": 0.04701460000069346}, "total_s": 45.927563900004316, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
2
+ {"timestamp_utc": "2026-03-02T16:08:08Z", "stages_s": {"preprocess": 6.989999383222312e-05, "rewrite": 3.0064916999981506, "structural": 4.2000028770416975e-06, "probe": 3.01228209999681, "retrieval": 3.3860946000058902, "selection": 5.285027000005357, "implication_expansion": 0.147530000002007, "prompt_composition": 3.850000211969018e-05, "group_display": 0.10624819999793544}, "total_s": 14.949083599989535, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
3
+ {"timestamp_utc": "2026-03-02T16:08:37Z", "stages_s": {"preprocess": 7.179999374784529e-05, "rewrite": 4.608368299988797, "structural": 3.6999990697950125e-06, "probe": 1.5999976312741637e-06, "retrieval": 3.4574174999870593, "selection": 8.8562099999981, "implication_expansion": 0.14937499999359716, "prompt_composition": 3.650000144261867e-05, "group_display": 0.04632819999824278}, "total_s": 17.122792900001514, "config": {"timeout_rewrite_s": 45.0, "timeout_struct_s": 45.0, "timeout_probe_s": 45.0, "timeout_select_s": 45.0}}
data/structural_tag_definitions.csv CHANGED
@@ -2,18 +2,18 @@ enabled,group_name,constraint,tag,definition
2
  1,character_count,exclusive,zero_pictured,"No characters or living beings are visible in the image."
3
  1,character_count,exclusive,solo,"Exactly one character is visible in the image."
4
  1,character_count,exclusive,duo,"Exactly two characters are visible in the image."
5
- 1,character_count,exclusive,trio,"Exactly three characters are visible in the image; select only when the count is clearly three."
6
  1,character_count,exclusive,group,"Four or more characters are visible in the image; do not use for one, two, or three."
7
- 1,body_type,multi,anthro,"An animal/furry character with BOTH human-like body plan (upright stance, human-like torso/arms/hands) and clear animal traits (fur, muzzle, tail, animal ears, paws, or species cues)."
8
- 1,body_type,multi,feral,"Select only when standard animal anatomy is explicit (for example quadruped, on all fours, non-humanoid animal body). Do not select for upright anthropomorphic characters."
9
- 1,body_type,multi,humanoid,"Select only when human/humanoid/person/man/woman/boy/girl wording is explicit and animal-species traits are not present."
10
  1,body_type,multi,taur,"Select only for an explicit centaur-like body plan: a humanoid upper torso attached to a separate four-legged lower body."
11
- 1,gender,multi,male,"Select only when the description explicitly indicates male presentation or identity, such as male/man/boy/he/him/his/father/husband/boyfriend. 'boy' and male pronouns count as explicit evidence."
12
- 1,gender,multi,female,"Select only when the description explicitly indicates female presentation or identity, such as female/woman/girl/she/her/hers/mother/wife/girlfriend. 'girl' and female pronouns count as explicit evidence."
13
- 1,gender,multi,ambiguous_gender,"Select only when the description explicitly says gender is unknown, ambiguous, androgynous, mixed, or not determinable. Do not use this as a default fallback when gender is simply unmentioned."
14
  1,gender,multi,intersex,"Select only when intersex or mixed-sex-traits wording is explicit in the description."
15
  1,clothing_state,multi,clothed,"At least one character is explicitly described as wearing clothing or a garment (for example shirt, pants, shorts, dress, coat, loincloth, armor, uniform)."
16
- 1,clothing_state,multi,nude,"Select only when the description explicitly indicates no clothing (nude/naked/unclothed). Do not infer nude just because clothing is not mentioned."
17
  1,clothing_state,multi,topless,"The upper body/chest is uncovered while lower body has clothing. This includes descriptions with shorts/pants/loincloth and no shirt/top."
18
  1,clothing_state,multi,bottomless,"The lower body is uncovered while the upper body has clothing."
19
  1,visual_elements,multi,looking_at_viewer,"Select only when explicit gaze wording appears (looking at viewer, looking at camera, looking directly at us, direct eye contact). Do not infer from front view, pose, or expression."
 
2
  1,character_count,exclusive,zero_pictured,"No characters or living beings are visible in the image."
3
  1,character_count,exclusive,solo,"Exactly one character is visible in the image."
4
  1,character_count,exclusive,duo,"Exactly two characters are visible in the image."
5
+ 1,character_count,exclusive,trio,"Exactly three characters are visible in the image; select only when the count is clearly three."
6
  1,character_count,exclusive,group,"Four or more characters are visible in the image; do not use for one, two, or three."
7
+ 1,body_type,multi,anthro,"An animal/furry character with BOTH human-like body plan (upright stance, human-like torso/arms/hands) and clear animal traits (fur, muzzle, tail, animal ears, paws, or species cues)."
8
+ 1,body_type,multi,feral,"A non-humanoid animal body plan (typically quadruped or otherwise animal-shaped, without human-like torso/hands). Do not select if explicitly anthropomorphic."
9
+ 1,body_type,multi,humanoid,"A human or near-human character with no explicit animal-species traits. Do not select if animal species words or animal traits (muzzle, tail, paws, animal ears, heavy fur coat, scales) are present."
10
  1,body_type,multi,taur,"Select only for an explicit centaur-like body plan: a humanoid upper torso attached to a separate four-legged lower body."
11
+ 1,gender,multi,male,"Select only when the description explicitly indicates male presentation or identity, such as male/man/boy/he/him/his/father/husband/boyfriend. 'boy' and male pronouns count as explicit evidence."
12
+ 1,gender,multi,female,"Select only when the description explicitly indicates female presentation or identity, such as female/woman/girl/she/her/hers/mother/wife/girlfriend. 'girl' and female pronouns count as explicit evidence."
13
+ 1,gender,multi,ambiguous_gender,"Select only when the description explicitly says gender is unknown, ambiguous, androgynous, mixed, or not determinable. Do not use this as a default fallback when gender is simply unmentioned."
14
  1,gender,multi,intersex,"Select only when intersex or mixed-sex-traits wording is explicit in the description."
15
  1,clothing_state,multi,clothed,"At least one character is explicitly described as wearing clothing or a garment (for example shirt, pants, shorts, dress, coat, loincloth, armor, uniform)."
16
+ 1,clothing_state,multi,nude,"Select only when the description explicitly indicates no clothing (nude/naked/unclothed). Do not infer nude just because clothing is not mentioned."
17
  1,clothing_state,multi,topless,"The upper body/chest is uncovered while lower body has clothing. This includes descriptions with shorts/pants/loincloth and no shirt/top."
18
  1,clothing_state,multi,bottomless,"The lower body is uncovered while the upper body has clothing."
19
  1,visual_elements,multi,looking_at_viewer,"Select only when explicit gaze wording appears (looking at viewer, looking at camera, looking directly at us, direct eye contact). Do not infer from front view, pose, or expression."
psq_rag/llm/select.py CHANGED
@@ -1,80 +1,59 @@
1
- # psq_rag/llm/select.py
2
- # Stage 3: Closed-Set Selection (LangChain-only implementation)
3
- #
4
- # This module intentionally uses LangChain for:
5
- # - prompt templating (including {N})
6
- # - LLM call orchestration
7
- # - JSON parsing
8
- #
9
- # There is NO fallback path. If LangChain dependencies are missing, this module
10
- # should fail loudly so you install them.
11
-
12
- import os
13
- import re
14
- import csv
15
  from dataclasses import dataclass
16
  from pathlib import Path
17
- from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union, cast, Literal, Mapping
18
-
19
- from langchain_openai import ChatOpenAI
20
- from langchain_core.prompts import ChatPromptTemplate
21
- from langchain_core.output_parsers import PydanticOutputParser
22
- from pydantic import BaseModel, Field, SecretStr
23
- from rapidfuzz import fuzz
24
-
25
- from psq_rag.retrieval.psq_retrieval import Candidate # Candidate(tag, score_*, count, sources)
26
- from psq_rag.retrieval.state import get_tag_type_name, get_tag2aliases
27
-
28
- # Character-typed tags that are generic categories, not actual named characters.
29
- # These leak through the alias filter because they match common words in captions.
30
- # They are excluded from the entity pipeline and instead routed to general selection.
31
- _GENERIC_CHARACTER_TAGS = frozenset({
32
- "fan_character",
33
- "background_character",
34
- "unnamed_character",
35
- "unknown_character",
36
- "anonymous_character",
37
- "viewer",
38
- "original_character",
39
- })
40
-
41
-
42
- WHY_ENUM = ["explicit", "strong_implied", "weak_implied", "style_or_meta", "other"]
43
-
44
- # Ordinal rank: lower = more confident. Used for threshold filtering.
45
- WHY_RANK: Dict[str, int] = {
46
- "explicit": 0,
47
- "strong_implied": 1,
48
- "weak_implied": 2,
49
- "style_or_meta": 3,
50
- "other": 4,
51
- }
52
-
53
- # Deterministic mapping: ordinal "why" -> numeric score for ordering/debug.
54
- WHY_TO_SCORE: Dict[str, float] = {
55
- "explicit": 0.90,
56
- "strong_implied": 0.70,
57
- "weak_implied": 0.45,
58
- "style_or_meta": 0.35,
59
- "other": 0.25,
60
- }
61
-
62
-
63
  # IMPORTANT ABOUT TEMPLATING:
64
  # - This string is rendered by LangChain's f-string template engine.
65
  # - Literal JSON braces must be escaped as {{ and }}.
66
  # - {N} is a real template variable and MUST be provided.
67
  SELECT_SYSTEM_TEMPLATE = """You are given a description of an image and a list of imageboard tags.
68
 
69
- Select the tags that correspond to content that would be visible or depicted in the described image.
70
-
71
- The list contains only valid tags; many of them are irrelevant to the image.
72
 
73
  Return JSON ONLY matching this schema:
74
 
75
  {{
76
  \"selections\": [
77
- {{\"i\": <int>, \"why\": \"<one of: explicit|strong_implied|weak_implied|style_or_meta|other>\"}},
78
  ...
79
  ]
80
  }}
@@ -82,34 +61,27 @@ Return JSON ONLY matching this schema:
82
  Rules:
83
  - Choose ONLY from indices 1..{N}.
84
  - Do NOT output tag text.
85
- - Do NOT output any keys other than \"selections\", and inside each item only the item index \"i\" and \"why\".
86
  - Do select both a general tag and a more specific tag when both apply (for example, \"shirt\" and \"grey shirt\").
87
-
88
- Define \"why\" as:
89
- - explicit: directly stated in the image description
90
- - strong_implied: very likely given the description, even if not literally stated
91
- - weak_implied: plausible but not strongly supported by the description
92
- - style_or_meta: stylistic or presentation-related tags only if clearly indicated
93
- - other: fallback category; use sparingly
94
  """
95
-
96
-
97
- def _get_select_system_template() -> str:
98
- """Return Stage 3 selection prompt text."""
99
- return SELECT_SYSTEM_TEMPLATE
100
-
101
-
102
  ENTITY_SYSTEM_TEMPLATE = """You are given a description of an image and a list of CHARACTER tags.
103
-
104
- These character tags have already been pre-filtered to only include characters whose names
105
- (or known aliases) appear in the image description. Your job is to confirm which of these
106
- pre-filtered candidates are the correct match for the character mentioned by the user.
107
-
108
  Return JSON ONLY matching this schema:
109
 
110
  {{
111
  \"selections\": [
112
- {{\"i\": <int>, \"why\": \"explicit\"}},
113
  ...
114
  ]
115
  }}
@@ -117,213 +89,205 @@ Return JSON ONLY matching this schema:
117
  Rules for character selection:
118
  - Choose ONLY from indices 1..{N}.
119
  - Do NOT output tag text.
120
- - Always use \"why\": \"explicit\" for all selections.
121
  - Select the tag that best represents the character as described.
122
  - If the user described a specific variant (e.g. \"pikachu libre\", \"detective pikachu\"),
123
  select that specific variant tag.
124
- - If the user described only the base character (e.g. just \"pikachu\"), select only the
125
- base/default tag, NOT costume or variant tags.
126
- - When uncertain between variants, prefer the simplest/most general tag.
127
- """
128
-
129
-
130
- USER_TEMPLATE = """IMAGE DESCRIPTION:
131
- {image_description}
132
-
133
- CANDIDATES (choose by index only):
134
- {candidate_lines}
135
-
136
- Select up to {per_call_budget} indices. Output fewer if uncertain.
137
- """
138
-
139
-
140
  @dataclass(frozen=True)
141
  class Selected:
142
  i: int
143
  tag: str # canonical tag (underscore form)
144
- why: str
145
- score: float
146
-
147
-
148
- WhyLiteral = Literal["explicit", "strong_implied", "weak_implied", "style_or_meta", "other"]
149
 
150
 
151
  class Stage3SelectionItem(BaseModel):
152
  i: int = Field(..., description="1-based index into the candidate list.")
153
- why: WhyLiteral = Field(..., description="Rationale code from the allowed set.")
154
 
155
 
156
  class Stage3SelectionResponse(BaseModel):
157
- selections: List[Stage3SelectionItem] = Field(default_factory=list)
158
-
159
-
160
- def _build_response_format() -> Dict[str, Any]:
161
- # Strict JSON Schema structured output.
162
- schema = {
163
- "type": "object",
164
- "properties": {
165
- "selections": {
166
- "type": "array",
167
  "items": {
168
  "type": "object",
169
  "properties": {
170
  "i": {"type": "integer"},
171
- "why": {"type": "string", "enum": WHY_ENUM},
172
  },
173
- "required": ["i", "why"],
174
  "additionalProperties": False,
175
  },
176
  }
177
- },
178
- "required": ["selections"],
179
- "additionalProperties": False,
180
- }
181
-
182
- return {
183
- "type": "json_schema",
184
- "json_schema": {
185
- "name": "stage3_selection",
186
- "strict": True,
187
- "schema": schema,
188
- },
189
- }
190
-
191
-
192
- def _get_llm(*, temperature: float, max_tokens: int, response_format: Dict[str, Any]) -> ChatOpenAI:
193
- api_key = os.getenv("OPENROUTER_API_KEY")
194
- if not api_key:
195
- raise RuntimeError(
196
- "OPENROUTER_API_KEY is not set.\n"
197
- "Set it in your environment before running Stage 3."
198
- )
199
- api_key = SecretStr(cast(str, api_key))
200
-
201
- model = os.getenv("OPENROUTER_MODEL", "meta-llama/llama-3.1-8b-instruct")
202
- headers: Dict[str, str] = {}
203
- if referer := os.getenv("OPENROUTER_HTTP_REFERER"):
204
- headers["HTTP-Referer"] = referer
205
- if title := os.getenv("OPENROUTER_X_TITLE"):
206
- headers["X-Title"] = title
207
-
208
- # OpenRouter OpenAI-compatible endpoint.
209
- return ChatOpenAI(
210
- model=model,
211
- base_url="https://openrouter.ai/api/v1",
212
- api_key=api_key,
213
- temperature=temperature,
214
- max_completion_tokens=max_tokens,
215
- default_headers=headers,
216
- # Provider-specific request body fields (OpenAI-compatible).
217
- # Response Healing plugin reduces malformed-JSON failures (syntax only).
218
- extra_body={
219
- "response_format": response_format,
220
- "plugins": [{"id": "response-healing"}],
221
- },
222
- )
223
-
224
-
225
- def _phrase_key_for_candidate(c: Candidate) -> str:
226
- # Deterministic "primary phrase" for grouping.
227
- if c.sources:
228
- return sorted(c.sources)[0]
229
- return ""
230
-
231
-
232
- def _interleave_round_robin(cands: Sequence[Candidate]) -> List[Candidate]:
233
- """Round-robin interleave by primary source phrase.
234
-
235
- NOTE: counts are used only for ordering; they are NOT shown to the LLM.
236
- """
237
- groups: Dict[str, List[Candidate]] = {}
238
- for c in cands:
239
- k = _phrase_key_for_candidate(c)
240
- groups.setdefault(k, []).append(c)
241
-
242
- for k in groups:
243
- groups[k].sort(key=lambda x: (x.score_combined, (x.count or -1)), reverse=True)
244
-
245
- keys = sorted(groups.keys())
246
-
247
- out: List[Candidate] = []
248
- idx = 0
249
- while True:
250
- progressed = False
251
- for k in keys:
252
- if idx < len(groups[k]):
253
- out.append(groups[k][idx])
254
- progressed = True
255
- if not progressed:
256
- break
257
- idx += 1
258
-
259
- return out
260
-
261
-
262
- def _build_chunks(cands: Sequence[Candidate], chunk_size: int) -> List[List[Candidate]]:
263
- if chunk_size <= 0:
264
- raise ValueError(f"chunk_size must be > 0, got {chunk_size}")
265
- ordered = _interleave_round_robin(cands)
266
- return [ordered[i:i + chunk_size] for i in range(0, len(ordered), chunk_size)]
267
-
268
-
269
- def _display_tag(tag: str) -> str:
270
- # Display tags with spaces for the LLM, but keep canonical underscores internally.
271
- return tag.replace("_", " ")
272
-
273
-
274
- def _format_candidates_local(
275
- cands: Sequence[Candidate],
276
- candidate_display: Optional[Mapping[str, str]] = None,
277
- ) -> Tuple[str, Dict[int, str], Dict[int, Candidate]]:
278
- lines: List[str] = []
279
- idx_to_tag: Dict[int, str] = {}
280
- idx_to_candidate: Dict[int, Candidate] = {}
281
- for j, c in enumerate(cands, start=1):
282
- idx_to_tag[j] = c.tag
283
- idx_to_candidate[j] = c
284
- display = candidate_display.get(c.tag) if candidate_display else None
285
- if not display:
286
- display = _display_tag(c.tag)
287
- lines.append(f"{j}. {display}")
288
- return "\n".join(lines), idx_to_tag, idx_to_candidate
289
-
290
-
291
- def _phrases_in_call(cands: Sequence[Candidate]) -> int:
292
- s = set()
293
- for c in cands:
294
- for src in c.sources:
295
- s.add(src)
296
- return len(s)
297
-
298
-
299
  def _parse_validate_map(
300
  parsed: Any,
301
  idx_to_tag: Dict[int, str],
302
  per_call_budget: int,
303
- ) -> Tuple[List[Selected], Dict[str, Any]]:
304
- diag = {
305
- "parse_ok": isinstance(parsed, dict),
306
- "invalid_items": 0,
307
- "oob_indices": 0,
308
- "dupe_indices": 0,
309
- "kept": 0,
310
- }
311
-
312
- if isinstance(parsed, BaseModel):
313
- parsed = parsed.model_dump() if hasattr(parsed, "model_dump") else parsed.dict()
314
- diag["parse_ok"] = isinstance(parsed, dict)
315
-
316
- if not isinstance(parsed, dict):
317
- return [], diag
318
-
319
- selections = parsed.get("selections", [])
320
- if not isinstance(selections, list):
321
- diag["parse_ok"] = False
322
- return [], diag
323
-
324
- out: List[Selected] = []
325
- seen_i = set()
326
-
327
  for item in selections:
328
  if len(out) >= per_call_budget:
329
  break
@@ -332,7 +296,6 @@ def _parse_validate_map(
332
  continue
333
 
334
  i = item.get("i")
335
- why = item.get("why")
336
 
337
  if isinstance(i, bool) or not isinstance(i, int):
338
  diag["invalid_items"] += 1
@@ -343,1057 +306,1046 @@ def _parse_validate_map(
343
  if i not in idx_to_tag:
344
  diag["oob_indices"] += 1
345
  continue
346
- if not isinstance(why, str) or why not in WHY_ENUM:
 
347
  diag["invalid_items"] += 1
348
  continue
349
  seen_i.add(i)
350
  tag = idx_to_tag[i]
351
- out.append(Selected(i=i, tag=tag, why=why, score=WHY_TO_SCORE[why]))
352
-
353
- diag["kept"] = len(out)
354
- return out, diag
355
-
356
-
357
- def _split_candidates_by_type(
358
- candidates: List[Candidate],
359
- log,
360
- ) -> Tuple[List[Tuple[int, Candidate]], List[Tuple[int, Candidate]]]:
361
- """Split candidates into general vs entity (character only) lists.
362
-
363
- Returns:
364
- (general_list, entity_list) where each item is (original_index, candidate)
365
-
366
- Tag types:
367
- - General: 0 (general), 1 (artist), 5 (species), 7 (meta)
368
- - Entity: 4 (character) only
369
- - Filtered: 3 (copyright) - too broad for image generation
370
- """
371
- general_with_idx: List[Tuple[int, Candidate]] = []
372
- entity_with_idx: List[Tuple[int, Candidate]] = []
373
-
374
- unknown_count = 0
375
- copyright_count = 0
376
-
377
- generic_char_count = 0
378
-
379
- for idx, cand in enumerate(candidates):
380
- type_name = get_tag_type_name(cand.tag)
381
-
382
- if type_name == "character":
383
- if cand.tag in _GENERIC_CHARACTER_TAGS:
384
- # Route generic character-category tags to general selection
385
- general_with_idx.append((idx, cand))
386
- generic_char_count += 1
387
- else:
388
- entity_with_idx.append((idx, cand))
389
- elif type_name == "copyright":
390
- # Filter out copyright/series tags - too broad for image generation
391
- copyright_count += 1
392
- elif type_name in ("general", "artist", "species", "meta"):
393
- general_with_idx.append((idx, cand))
394
- else:
395
- # Unknown or None - treat as general by default
396
- general_with_idx.append((idx, cand))
397
- unknown_count += 1
398
-
399
- if log:
400
- log(
401
- f"Stage3 split: "
402
- f"general={len(general_with_idx)} "
403
- f"entity={len(entity_with_idx)} "
404
- f"copyright_filtered={copyright_count} "
405
- f"generic_char_to_general={generic_char_count} "
406
- f"unknown_type={unknown_count}"
407
- )
408
-
409
- return general_with_idx, entity_with_idx
410
-
411
-
412
- # Regex to strip series/franchise suffixes from aliases, e.g. _(sonic), _(mlp), _(character)
413
- _SERIES_SUFFIX_RE = re.compile(r"_\([^)]+\)$")
414
-
415
-
416
- def _normalize_for_matching(text: str) -> str:
417
- """Lowercase, replace underscores with spaces, strip series suffixes."""
418
- text = text.lower().strip()
419
- text = _SERIES_SUFFIX_RE.sub("", text)
420
- text = text.replace("_", " ")
421
- return text
422
-
423
-
424
- def _query_words(query: str) -> Set[str]:
425
- """Extract individual words from the user query for matching."""
426
- return set(_normalize_for_matching(query).split())
427
-
428
-
429
- def _alias_matches_query(alias_norm: str, query_words: Set[str], query_norm: str,
430
- fuzzy_threshold: int = 85) -> bool:
431
- """Check if an alias matches the user query.
432
-
433
- Matching logic:
434
- 1. Exact substring: alias appears as a substring of the query
435
- 2. Word subset: all words in the alias appear in the query words
436
- 3. Fuzzy: alias is close to a word in the query (handles typos)
437
- """
438
- # Exact substring match
439
- if alias_norm in query_norm:
440
- return True
441
-
442
- alias_words = alias_norm.split()
443
- if not alias_words:
444
- return False
445
-
446
- # Word subset match: all alias words must appear in query
447
- if all(w in query_words for w in alias_words):
448
- return True
449
-
450
- # For single-word aliases, try fuzzy matching against each query word
451
- if len(alias_words) == 1:
452
- for qw in query_words:
453
- if fuzz.ratio(alias_words[0], qw) >= fuzzy_threshold:
454
- return True
455
-
456
- # For multi-word aliases, try fuzzy partial ratio against whole query
457
- if len(alias_words) > 1:
458
- if fuzz.partial_ratio(alias_norm, query_norm) >= fuzzy_threshold:
459
- return True
460
-
461
- return False
462
-
463
-
464
- def _character_matches_via_aliases(
465
- tag: str,
466
- query: str,
467
- tag2aliases: Dict[str, List[str]],
468
- query_words: Set[str],
469
- query_norm: str,
470
- fuzzy_threshold: int = 85,
471
- ) -> bool:
472
- """Check if a character tag matches the user query via its aliases.
473
-
474
- For a character tag to match:
475
- - The tag name itself (normalized) must match, OR
476
- - At least one of its registered aliases must match.
477
-
478
- Empty aliases list means no known aliases; still check the tag name itself.
479
- """
480
- # Check the tag name itself
481
- tag_norm = _normalize_for_matching(tag)
482
- if _alias_matches_query(tag_norm, query_words, query_norm, fuzzy_threshold):
483
- return True
484
-
485
- # Check all registered aliases
486
- aliases = tag2aliases.get(tag, [])
487
- for alias in aliases:
488
- alias_norm = _normalize_for_matching(alias)
489
- if not alias_norm:
490
- continue
491
- if _alias_matches_query(alias_norm, query_words, query_norm, fuzzy_threshold):
492
- return True
493
-
494
- return False
495
-
496
-
497
  def llm_select_indices(
498
- query_text: str, # kept for compatibility; treated as IMAGE DESCRIPTION
499
- candidates: Union[
500
- Sequence[Candidate],
501
- Sequence[str],
502
- Sequence[Tuple[str, float]],
503
- ],
504
- max_pick: int, # legacy param; applied after union + ordering (optional)
505
- log,
506
- retries: int = 2,
507
- *,
508
- mode: str = "chunked_map_union", # "single_shot" or "chunked_map_union"
509
- chunk_size: int = 60,
510
- per_phrase_k: int = 2, # per-call budget = per_phrase_k * phrases_in_call
511
- temperature: float = 0.0,
512
- max_tokens: int = 512,
513
- return_metadata: bool = False,
514
- return_diagnostics: bool = False,
515
- min_why: Optional[str] = "strong_implied",
516
- candidate_display: Optional[Mapping[str, str]] = None,
517
- ) -> Union[
518
- List[int],
519
- Tuple[List[int], Dict[str, str]],
520
- Tuple[List[int], Dict[str, str], Dict[str, Any]],
521
- ]:
522
- """Return indices into the ORIGINAL candidates list (legacy interface).
523
-
524
- min_why: if set, only keep tags whose 'why' is at or above this confidence
525
- level. E.g. min_why="explicit" keeps only explicit matches;
526
- min_why="strong_implied" keeps explicit + strong_implied.
527
- Default: "strong_implied".
528
-
529
- This implementation uses LangChain ONLY.
530
-
531
- NOTE: query_text is treated as the image description (original prompt).
532
- """
533
-
534
- image_description = query_text
535
-
536
- # Normalize candidates:
537
- # - preferred: List[Candidate]
538
- # - legacy: List[(tag, sim)] (count/sources unavailable)
539
- norm: List[Candidate] = []
540
- tag_to_first_index: Dict[str, int] = {}
541
-
542
- branch = "empty"
543
- cand0_type = type(candidates[0]).__name__ if candidates else "none"
544
-
545
- if candidates and isinstance(candidates[0], Candidate):
546
- branch = "candidate"
547
- typed_candidates = cast(Sequence[Candidate], candidates)
548
- for idx, c in enumerate(typed_candidates):
549
- if c.tag not in tag_to_first_index:
550
- tag_to_first_index[c.tag] = idx
551
- norm.append(c)
552
- elif candidates and isinstance(candidates[0], str):
553
- branch = "string"
554
- typed_candidates = cast(Sequence[str], candidates)
555
- for idx, tag in enumerate(typed_candidates):
556
- if tag not in tag_to_first_index:
557
- tag_to_first_index[tag] = idx
558
- norm.append(
559
- Candidate(
560
- tag=tag,
561
- score_combined=0.0,
562
- score_fasttext=None,
563
- score_context=None,
564
- count=None,
565
- sources=[],
566
- )
567
- )
568
- else:
569
- if candidates:
570
- branch = "tuple"
571
- typed_candidates = cast(Sequence[Tuple[str, float]], candidates)
572
- for idx, row in enumerate(typed_candidates):
573
- if not isinstance(row, (list, tuple)) or len(row) < 2:
574
- raise ValueError("Stage 3 candidates must be Candidate, tag strings, or (tag, score) tuples.")
575
- tag, sim = row[0], row[1]
576
- if tag not in tag_to_first_index:
577
- tag_to_first_index[tag] = idx
578
- norm.append(
579
- Candidate(
580
- tag=tag,
581
- score_combined=float(sim),
582
- score_fasttext=None,
583
- score_context=None,
584
- count=None,
585
- sources=[],
586
- )
587
- )
588
-
589
- if log:
590
- if norm:
591
- log(
592
- "Stage3 input: "
593
- f"type0={cand0_type} "
594
- f"branch={branch} "
595
- f"norm0_score={norm[0].score_combined!r} "
596
- f"norm0_sources_empty={not bool(norm[0].sources)}"
597
- )
598
- else:
599
- log(f"Stage3 input: type0={cand0_type} branch={branch} (no candidates)")
600
-
601
- if mode not in ("single_shot", "chunked_map_union"):
602
- raise ValueError(f"Invalid mode: {mode}")
603
-
604
- response_format = _build_response_format()
605
- llm = _get_llm(temperature=temperature, max_tokens=max_tokens, response_format=response_format)
606
- model_name = os.getenv("OPENROUTER_MODEL", "meta-llama/llama-3.1-8b-instruct")
607
-
608
- parser = PydanticOutputParser(pydantic_object=Stage3SelectionResponse)
609
- select_system_template = _get_select_system_template()
610
-
611
- # Global union: tag -> best (score, why)
612
- best: Dict[str, Tuple[float, str]] = {}
613
- diagnostics: Dict[str, Any] = {
614
- "mode": mode,
615
- "chunk_strategy": "interleave",
616
- "chunk_passes": 1,
617
- "chunk_shuffle_within_call": False,
618
- "calls_total": 0,
619
- "calls_with_selection": 0,
620
- "calls_exhausted_retries": 0,
621
- "attempts_total": 0,
622
- "attempt_errors": 0,
623
- "attempt_parse_fail": 0,
624
- "attempt_parse_ok": 0,
625
- "invalid_items_total": 0,
626
- "oob_indices_total": 0,
627
- "dupe_indices_total": 0,
628
- "kept_total": 0,
629
- "attempts_by_n_local": {},
630
- }
631
-
632
- def _record_attempt_for_n(n_local: int, *, parse_ok: bool, error: bool) -> None:
633
- by_n = diagnostics["attempts_by_n_local"]
634
- key = str(n_local)
635
- if key not in by_n:
636
- by_n[key] = {
637
- "attempts": 0,
638
- "parse_ok": 0,
639
- "parse_fail": 0,
640
- "errors": 0,
641
- }
642
- by_n[key]["attempts"] += 1
643
- if error:
644
- by_n[key]["errors"] += 1
645
- elif parse_ok:
646
- by_n[key]["parse_ok"] += 1
647
- else:
648
- by_n[key]["parse_fail"] += 1
649
-
650
- def run_call(call_cands: Sequence[Candidate], label: str, system_template: str) -> None:
651
- # Create chain with the provided system template
652
- prompt = ChatPromptTemplate.from_messages(
653
- [
654
- ("system", system_template),
655
- ("human", USER_TEMPLATE),
656
- ],
657
- template_format="f-string",
658
- )
659
- chain = prompt | llm | parser
660
-
661
- ordered = _interleave_round_robin(call_cands) if mode == "single_shot" else list(call_cands)
662
- candidate_lines, idx_to_tag, idx_to_candidate = _format_candidates_local(
663
- ordered,
664
- candidate_display=candidate_display,
665
- )
666
- N_local = len(idx_to_tag)
667
- diagnostics["calls_total"] += 1
668
-
669
- phrases = _phrases_in_call(call_cands)
670
- per_call_budget = max(1, per_phrase_k * phrases) if phrases > 0 else per_phrase_k
671
- summary_logged = False
672
-
673
- if log:
674
- log(f"Stage3 {label}: candidates (local indices):\n{candidate_lines}")
675
- if phrases > 0:
676
- distinct_phrases = sorted({src for c in call_cands for src in c.sources})
677
- log(
678
- f"Stage3 {label}: distinct_phrases={len(distinct_phrases)} "
679
- f"phrases={', '.join(distinct_phrases)}"
680
- )
681
-
682
- # Invoke LangChain chain (templating fills {N} and other vars)
683
- for att in range(retries + 1):
684
- try:
685
- diagnostics["attempts_total"] += 1
686
- if log:
687
- log(
688
- f"Stage3 {label}: "
689
- f"model={model_name} "
690
- f"N={N_local} "
691
- f"phrases={phrases} "
692
- f"per_call_budget={per_call_budget} "
693
- f"response_healing=on"
694
- )
695
-
696
- parsed = chain.invoke(
697
- {
698
- "N": N_local,
699
- "image_description": image_description,
700
- "candidate_lines": candidate_lines,
701
- "per_call_budget": per_call_budget,
702
- }
703
- )
704
- selected, diag = _parse_validate_map(parsed, idx_to_tag, per_call_budget=per_call_budget)
705
- diagnostics["invalid_items_total"] += int(diag.get("invalid_items", 0))
706
- diagnostics["oob_indices_total"] += int(diag.get("oob_indices", 0))
707
- diagnostics["dupe_indices_total"] += int(diag.get("dupe_indices", 0))
708
- diagnostics["kept_total"] += int(diag.get("kept", 0))
709
- if bool(diag.get("parse_ok", False)):
710
- diagnostics["attempt_parse_ok"] += 1
711
- _record_attempt_for_n(N_local, parse_ok=True, error=False)
712
- else:
713
- diagnostics["attempt_parse_fail"] += 1
714
- _record_attempt_for_n(N_local, parse_ok=False, error=False)
715
- if log:
716
- log(f"Stage3 {label}: attempt {att+1} diag={diag}")
717
- if not summary_logged and (selected or att == retries):
718
- log(
719
- f"Stage3 {label}: summary "
720
- f"N={N_local} selected={len(selected)} per_call_budget={per_call_budget}"
721
- )
722
- summary_logged = True
723
  if selected:
724
  lines = [
725
  f"Stage3 {label} selections:",
726
  *[
727
  (
728
  f' - i={s.i} tag="{s.tag}" '
729
- f"why={s.why} score={s.score:.2f} "
730
  f"sources={idx_to_candidate.get(s.i).sources if idx_to_candidate.get(s.i) else []}"
731
  )
732
  for s in selected
733
  ],
734
  ]
735
- log("\n".join(lines))
736
- else:
737
- log(f"Stage3 {label} selections: (none)")
738
-
739
  if selected:
740
  diagnostics["calls_with_selection"] += 1
741
  for s in selected:
742
- prev = best.get(s.tag)
743
- if prev is None or s.score > prev[0]:
744
- best[s.tag] = (s.score, s.why)
745
  return
746
-
747
- except Exception as e:
748
- diagnostics["attempt_errors"] += 1
749
- _record_attempt_for_n(N_local, parse_ok=False, error=True)
750
- if log:
751
- log(f"Stage3 {label}: attempt {att+1} error: {e}")
752
-
753
- if log:
754
- log(f"Stage3 {label}: gave up after {retries+1} attempts")
755
- diagnostics["calls_exhausted_retries"] += 1
756
-
757
- # Split candidates by type (general vs entity)
758
- general_with_idx, entity_with_idx = _split_candidates_by_type(norm, log)
759
-
760
- # Extract just the candidates for LLM calls
761
- general_cands = [cand for _, cand in general_with_idx]
762
- entity_cands = [cand for _, cand in entity_with_idx]
763
-
764
- # Process general candidates (attributes, actions, species, etc.)
765
- if general_cands:
766
- if mode == "single_shot":
767
- run_call(general_cands, "general_single_shot", select_system_template)
768
- else:
769
- base_chunks = _build_chunks(general_cands, chunk_size)
770
- for chunk_idx, chunk in enumerate(base_chunks):
771
- run_call(chunk, f"general_chunk_{chunk_idx}", select_system_template)
772
-
773
- # Process entity candidates (characters only) with alias-based pre-filtering
774
- if entity_cands:
775
- tag2aliases = get_tag2aliases()
776
- qwords = _query_words(image_description)
777
- qnorm = _normalize_for_matching(image_description)
778
-
779
- filtered_entity_cands: List[Candidate] = []
780
- filtered_out: List[str] = []
781
-
782
- for cand in entity_cands:
783
- if _character_matches_via_aliases(
784
- cand.tag, image_description, tag2aliases, qwords, qnorm
785
- ):
786
- filtered_entity_cands.append(cand)
787
- else:
788
- filtered_out.append(cand.tag)
789
-
790
- if log:
791
- log(
792
- f"Stage3 entity alias filter: "
793
- f"before={len(entity_cands)} "
794
- f"after={len(filtered_entity_cands)} "
795
- f"removed={len(filtered_out)}"
796
- )
797
- if filtered_out:
798
- log(f"Stage3 entity alias filter removed: {filtered_out[:20]}")
799
-
800
- if filtered_entity_cands:
801
- if mode == "single_shot":
802
- run_call(filtered_entity_cands, "entity_single_shot", ENTITY_SYSTEM_TEMPLATE)
803
- else:
804
- base_chunks = _build_chunks(filtered_entity_cands, chunk_size)
805
- for chunk_idx, chunk in enumerate(base_chunks):
806
- run_call(chunk, f"entity_chunk_{chunk_idx}", ENTITY_SYSTEM_TEMPLATE)
807
-
808
- # Apply why threshold: drop tags below the minimum confidence level.
809
- if min_why is not None:
810
- max_rank = WHY_RANK.get(min_why, 4)
811
- before = len(best)
812
- best = {t: v for t, v in best.items() if WHY_RANK.get(v[1], 4) <= max_rank}
813
- if log:
814
- log(f"Stage3 why filter: min_why={min_why} (rank<={max_rank}), "
815
- f"before={before} after={len(best)} dropped={before - len(best)}")
816
-
817
- # Deterministic ordering: derived score desc, tie-break by count desc (count not shown to LLM).
818
  count_by_tag = {c.tag: (c.count if c.count is not None else -1) for c in norm}
819
- ordered_tags = sorted(best.keys(), key=lambda t: (best[t][0], count_by_tag.get(t, -1)), reverse=True)
820
-
821
- # Legacy cap: apply AFTER union + ordering.
822
- if isinstance(max_pick, int) and max_pick > 0:
823
- ordered_tags = ordered_tags[:max_pick]
824
-
825
- # Map back to original indices
826
- out_idx: List[int] = []
827
  tag_why: Dict[str, str] = {}
828
  for t in ordered_tags:
829
  if t in tag_to_first_index:
830
  out_idx.append(tag_to_first_index[t])
831
- tag_why[t] = best[t][1] # why string
832
-
833
- if diagnostics["attempts_total"] > 0:
834
- diagnostics["attempt_failure_rate"] = (
835
- diagnostics["attempt_parse_fail"] + diagnostics["attempt_errors"]
836
- ) / diagnostics["attempts_total"]
837
- else:
838
- diagnostics["attempt_failure_rate"] = 0.0
839
-
840
- if diagnostics["calls_total"] > 0:
841
- diagnostics["call_exhaustion_rate"] = (
842
- diagnostics["calls_exhausted_retries"] / diagnostics["calls_total"]
843
- )
844
- else:
845
- diagnostics["call_exhaustion_rate"] = 0.0
846
-
847
- if return_metadata:
848
- if return_diagnostics:
849
- return out_idx, tag_why, diagnostics
850
- return out_idx, tag_why
851
-
852
- return out_idx
853
-
854
-
855
- # ---------------------------------------------------------------------------
856
- # Stage 3s: Structural tag inference (solo/duo/male/female/anthro/… )
857
- # ---------------------------------------------------------------------------
858
- # Group-based approach: tags are organized into semantic groups loaded from
859
- # tag_groups.json / tag_wiki_defs.json where possible, with curated fallback
860
- # definitions for tags whose wiki entries are only thumbnail references.
861
- #
862
- # Each group specifies a constraint mode:
863
- # "exclusive" = pick exactly one (e.g. character count)
864
- # "multi" = pick all that apply (e.g. body type, gender)
865
-
866
- import json as _json
867
-
868
- @dataclass
869
- class StructuralGroup:
870
- """One category of structural tags to probe."""
871
- name: str
872
- constraint: str # "exclusive" or "multi"
873
- tags: List[Tuple[str, str]] # (tag, definition) pairs
874
-
875
-
876
- def _load_structural_groups_from_csv() -> List[StructuralGroup]:
877
- """Load structural groups from data/structural_tag_definitions.csv."""
878
- data_dir = Path(__file__).resolve().parents[2] / "data"
879
- csv_path = data_dir / "structural_tag_definitions.csv"
880
- if not csv_path.is_file():
881
- return []
882
-
883
- groups_by_name: Dict[str, List[Tuple[str, str]]] = {}
884
- constraints_by_name: Dict[str, str] = {}
885
-
886
- with csv_path.open("r", encoding="utf-8", newline="") as f:
887
- reader = csv.DictReader(f)
888
- for row in reader:
889
- enabled = (row.get("enabled") or "1").strip().lower()
890
- if enabled in {"0", "false", "no"}:
891
- continue
892
-
893
- group_name = (row.get("group_name") or "").strip()
894
- constraint = (row.get("constraint") or "multi").strip().lower()
895
- tag = (row.get("tag") or "").strip()
896
- definition = " ".join((row.get("definition") or "").split())
897
-
898
- if not group_name or not tag or not definition:
899
- continue
900
- if constraint not in {"exclusive", "multi"}:
901
- constraint = "multi"
902
-
903
- if group_name not in groups_by_name:
904
- groups_by_name[group_name] = []
905
- constraints_by_name[group_name] = constraint
906
- groups_by_name[group_name].append((tag, definition))
907
-
908
- out: List[StructuralGroup] = []
909
- for group_name, tags in groups_by_name.items():
910
- if not tags:
911
- continue
912
- out.append(
913
- StructuralGroup(
914
- name=group_name,
915
- constraint=constraints_by_name.get(group_name, "multi"),
916
- tags=tags,
917
- )
918
- )
919
- return out
920
-
921
- def _load_structural_groups() -> List[StructuralGroup]:
922
- """Build structural groups from local config file with legacy fallback.
923
-
924
- Preferred source:
925
- data/structural_tag_definitions.csv
926
- Fallback:
927
- tag_wiki_defs.json + curated hardcoded defaults
928
- """
929
- csv_groups = _load_structural_groups_from_csv()
930
- if csv_groups:
931
- return csv_groups
932
-
933
- data_dir = Path(__file__).resolve().parents[2] / "data"
934
-
935
- # Load wiki definitions (may not exist yet)
936
- wiki_defs: Dict[str, str] = {}
937
- wiki_path = data_dir / "tag_wiki_defs.json"
938
- if wiki_path.is_file():
939
- with wiki_path.open("r", encoding="utf-8") as f:
940
- wiki_defs = _json.load(f)
941
-
942
- def _def(tag: str, fallback: str) -> str:
943
- """Get wiki definition if it's real text, otherwise use fallback."""
944
- d = wiki_defs.get(tag, "")
945
- # Skip thumbnail-only definitions
946
- if not d or d.startswith("thumb ") or len(d) < 15:
947
- return fallback
948
- return d[:200] # cap length for prompt
949
-
950
- groups: List[StructuralGroup] = []
951
-
952
- # ── Group A: Character Count (exclusive) ──
953
- groups.append(StructuralGroup(
954
- name="character_count",
955
- constraint="exclusive",
956
- tags=[
957
- ("zero_pictured", _def("zero_pictured",
958
- "No characters or living beings appear in the image")),
959
- ("solo", _def("solo",
960
- "Exactly one character appears in the image")),
961
- ("duo", _def("duo",
962
- "Exactly two characters appear in the image")),
963
- ("trio", _def("trio",
964
- "Exactly three characters appear in the image")),
965
- ("group", _def("group",
966
- "Four or more characters appear in the image")),
967
- ],
968
- ))
969
-
970
- # ── Group B: Body Type (multi — per character) ──
971
- # Key distinction the LLM must learn:
972
- # anthro = ANIMAL with human body shape (upright, hands)
973
- # humanoid = HUMAN or near-human (elf, dwarf) with NO animal features
974
- # feral = normal animal shape, on all fours
975
- groups.append(StructuralGroup(
976
- name="body_type",
977
- constraint="multi",
978
- tags=[
979
- ("anthro", _def("anthro",
980
- "An animal character with a human-like body: walks upright on two legs, "
981
- "has arms and hands. Examples: a wolf-person, a fox standing up. "
982
- "Still has animal features like fur, tail, muzzle")),
983
- ("feral", _def("feral",
984
- "A regular animal in its natural body shape. Walks on all fours (or "
985
- "flies/swims naturally). NOT standing upright, NOT humanized")),
986
- ("humanoid", _def("humanoid",
987
- "A human or human-like character with NO animal features. Includes "
988
- "humans, elves, dwarves, and fantasy races that look human. "
989
- "Does NOT include animal-people — those are anthro")),
990
- ("taur", _def("taur",
991
- "A centaur-like body: human or anthro upper body attached to a "
992
- "four-legged animal lower body")),
993
- ],
994
- ))
995
-
996
- # ── Group C: Gender (multi — per character) ──
997
- groups.append(StructuralGroup(
998
- name="gender",
999
- constraint="multi",
1000
- tags=[
1001
- ("male", _def("male",
1002
- "A character described as male, a boy, or with he/him pronouns")),
1003
- ("female", _def("female",
1004
- "A character described as female, a girl, or with she/her pronouns")),
1005
- ("ambiguous_gender", _def("ambiguous_gender",
1006
- "A character whose gender is not stated or cannot be determined")),
1007
- ("intersex", _def("intersex",
1008
- "A character explicitly described as intersex or hermaphrodite")),
1009
- ],
1010
- ))
1011
-
1012
- # ── Group D: Clothing State (multi) ──
1013
- groups.append(StructuralGroup(
1014
- name="clothing_state",
1015
- constraint="multi",
1016
- tags=[
1017
- ("clothed", _def("clothed",
1018
- "Wearing clothes on BOTH chest/torso AND legs/waist. "
1019
- "Examples: shirt and pants, dress, full outfit")),
1020
- ("nude", _def("nude",
1021
- "Wearing NO clothes at all. Completely naked, no shirt and no pants")),
1022
- ("topless", _def("topless",
1023
- "NO shirt/top (bare chest), BUT wearing pants/bottoms. "
1024
- "Upper body exposed, lower body covered")),
1025
- ("bottomless", _def("bottomless",
1026
- "Wearing shirt/top on chest, BUT NO pants/bottoms. "
1027
- "Upper body covered, lower body exposed")),
1028
- ],
1029
- ))
1030
-
1031
- # ── Group E: Common Visual Elements (multi) ──
1032
- groups.append(StructuralGroup(
1033
- name="visual_elements",
1034
- constraint="multi",
1035
- tags=[
1036
- ("looking_at_viewer", _def("looking_at_viewer",
1037
- "A character is looking directly at the camera or viewer")),
1038
- ("text", _def("text",
1039
- "The image contains visible writing, words, or lettering")),
1040
- ],
1041
- ))
1042
-
1043
- return groups
1044
-
1045
-
1046
- def _build_structural_prompt(groups: List[StructuralGroup]) -> Tuple[str, List[Tuple[str, str]]]:
1047
- """Build numbered statement list from structural groups.
1048
-
1049
- Returns (formatted_text, flat_list_of_(tag, definition)_pairs).
1050
- The flat list maps 1-based statement numbers to tags.
1051
- """
1052
- lines: List[str] = []
1053
- flat: List[Tuple[str, str]] = []
1054
- idx = 1
1055
-
1056
- for g in groups:
1057
- constraint_label = "pick EXACTLY ONE" if g.constraint == "exclusive" else "pick ALL that apply"
1058
- group_header = f"--- {g.name.replace('_', ' ').upper()} ({constraint_label}) ---"
1059
- lines.append(group_header)
1060
- for tag, defn in g.tags:
1061
- lines.append(f"{idx}. {defn}")
1062
- flat.append((tag, defn))
1063
- idx += 1
1064
- lines.append("") # blank line between groups
1065
-
1066
- return "\n".join(lines), flat
1067
-
1068
-
1069
- STRUCTURAL_SYSTEM_TEMPLATE = """You classify image descriptions by selecting true statements from a numbered list.
1070
-
1071
- The statements are organized into GROUPS. Each group header tells you how many to pick:
1072
- - "pick EXACTLY ONE" = choose the single best match in that group
1073
- - "pick ALL that apply" = choose every statement that is true
1074
-
1075
- IMPORTANT RULES:
1076
- 1. ONLY select a statement if the description directly says it or makes it very obvious.
1077
- 2. Do NOT guess or assume things the description does not mention.
1078
- 3. For body type: "anthro" means an ANIMAL with a human-shaped body (walks upright, has hands, but still has fur/tail/muzzle). "humanoid" means HUMAN or human-like with NO animal features. A wolf standing on two legs = anthro, NOT humanoid.
1079
- 4. For gender: only select male/female/intersex when there is explicit textual evidence (such as gender words or pronouns). Do not infer gender from species, body shape, clothing, or style. If no reliable gender cue is present, do not select male/female/intersex; use ambiguous_gender instead.
1080
- 5. For clothing state: READ CAREFULLY! "topless" = bare chest, wearing pants. "bottomless" = wearing shirt, no pants. If unsure, re-read the description.
1081
- 6. If clothing is not mentioned, do NOT pick any clothing statement.
1082
-
1083
- Return JSON ONLY:
1084
- {{"selections": [{{"i": 1}}, {{"i": 5}}]}}
1085
-
1086
- EXAMPLE:
1087
- Description: "A muscular male wolf standing in a forest, wearing jeans, giving a thumbs up"
1088
- Answer: {{"selections": [{{"i": 2}}, {{"i": 6}}, {{"i": 10}}, {{"i": 14}}]}}
1089
- Why: One character = solo (2). Wolf standing upright with hands = anthro (6), NOT humanoid because it is a wolf. Male (10). Wearing jeans = clothed (14)."""
1090
-
1091
- STRUCTURAL_USER_TEMPLATE = """Read this image description and select which statements are true.
1092
-
1093
- IMAGE DESCRIPTION:
1094
- {image_description}
1095
-
1096
- STATEMENTS (pick by number):
1097
- {statement_lines}"""
1098
-
1099
-
1100
class StructuralSelectionItem(BaseModel):
    """One entry of the ``selections`` array returned for the structural prompt."""

    # Statement number chosen by the model (1-based, as printed in the prompt).
    i: int = Field(..., description="1-based index into the statement list.")
1102
-
1103
-
1104
class StructuralSelectionResponse(BaseModel):
    """Top-level parsed response for the structural prompt."""

    # Defaults to an empty list when the key is absent.
    selections: List[StructuralSelectionItem] = Field(default_factory=list)
1106
-
1107
-
1108
- def _build_structural_response_format() -> Dict[str, Any]:
1109
- schema = {
1110
- "type": "object",
1111
- "properties": {
1112
- "selections": {
1113
- "type": "array",
1114
- "items": {
1115
- "type": "object",
1116
- "properties": {
1117
- "i": {"type": "integer"},
1118
- },
1119
- "required": ["i"],
1120
- "additionalProperties": False,
1121
- },
1122
- }
1123
- },
1124
- "required": ["selections"],
1125
- "additionalProperties": False,
1126
- }
1127
- return {
1128
- "type": "json_schema",
1129
- "json_schema": {
1130
- "name": "structural_selection",
1131
- "strict": True,
1132
- "schema": schema,
1133
- },
1134
- }
1135
-
1136
-
1137
# Process-wide memo of the structural groups, filled on first use so the
# JSON files behind them are read only once per process.
_cached_structural_groups: Optional[List[StructuralGroup]] = None


def _get_structural_groups() -> List[StructuralGroup]:
    """Return the structural groups, loading them on the first call only."""
    global _cached_structural_groups
    if _cached_structural_groups is not None:
        return _cached_structural_groups
    _cached_structural_groups = _load_structural_groups()
    return _cached_structural_groups
1146
-
1147
-
1148
def llm_infer_structural_tags(
    query_text: str,
    log=None,
    *,
    temperature: float = 0.0,
    max_tokens: int = 512,
    retries: int = 2,
) -> List[str]:
    """Infer structural tags by asking the LLM which numbered statements hold.

    The statement list is built from the cached structural groups; the model
    answers with statement numbers, which are mapped back to their tags.

    Args:
        query_text: Image description shown to the model.
        log: Optional callable taking one message string.
        temperature: Sampling temperature passed to the LLM factory.
        max_tokens: Completion budget passed to the LLM factory.
        retries: Extra attempts after the first one fails.

    Returns:
        The selected tags in the model's order, without duplicates. An empty
        list if every attempt raised.
    """
    if log:
        log("Stage3s (structural): inferring structural tags via group-based statement agreement")

    groups = _get_structural_groups()
    statement_lines, flat_tags = _build_structural_prompt(groups)
    statement_count = len(flat_tags)

    llm = _get_llm(
        temperature=temperature,
        max_tokens=max_tokens,
        response_format=_build_structural_response_format(),
    )
    model_name = os.getenv("OPENROUTER_MODEL", "meta-llama/llama-3.1-8b-instruct")

    parser = PydanticOutputParser(pydantic_object=StructuralSelectionResponse)
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", STRUCTURAL_SYSTEM_TEMPLATE),
            ("human", STRUCTURAL_USER_TEMPLATE),
        ],
        template_format="f-string",
    )
    chain = prompt | llm | parser

    if log:
        group_summary = ", ".join(f"{g.name}({len(g.tags)})" for g in groups)
        log(f"Stage3s: model={model_name} groups=[{group_summary}] total_statements={statement_count}")

    for attempt in range(1, retries + 2):
        try:
            result = chain.invoke({
                "N": statement_count,
                "image_description": query_text,
                "statement_lines": statement_lines,
            })

            # The parser yields a pydantic model; normalise it to a plain dict
            # (model_dump on pydantic v2, dict on v1).
            if isinstance(result, BaseModel):
                result = result.model_dump() if hasattr(result, "model_dump") else result.dict()

            raw_items = result.get("selections", []) if isinstance(result, dict) else []

            # Insertion-ordered dict used as an ordered set of tags.
            picked: Dict[str, None] = {}
            for entry in raw_items:
                number = entry.get("i") if isinstance(entry, dict) else None
                if isinstance(number, int) and 1 <= number <= statement_count:
                    picked.setdefault(flat_tags[number - 1][0], None)
            chosen_tags: List[str] = list(picked)

            if log:
                tag_str = ", ".join(chosen_tags) if chosen_tags else "(none)"
                log(f"Stage3s: attempt {attempt} selected {len(chosen_tags)} tags: {tag_str}")

            return chosen_tags

        except Exception as e:
            # Best-effort: any failure in the call or parse triggers a retry.
            if log:
                log(f"Stage3s: attempt {attempt} error: {e}")

    if log:
        log(f"Stage3s: gave up after {retries+1} attempts")
    return []
1227
-
1228
-
1229
- # ---------------------------------------------------------------------------
1230
- # Stage 3p: Simplified high-precision probe tags
1231
- # ---------------------------------------------------------------------------
1232
- _cached_runtime_probe_tags: Optional[List[str]] = None
1233
- _cached_runtime_probe_rows: Optional[List[Dict[str, str]]] = None
1234
- _cached_runtime_probe_wiki_defs: Optional[Dict[str, str]] = None
1235
-
1236
- _PROBE_GLOSSARY_FALLBACKS: Dict[str, str] = {
1237
- "anthro": "Animal character with human-like body shape, usually upright with arms and hands.",
1238
- "canid": "Member of dog-family species (wolves, foxes, dogs, coyotes).",
1239
- "felid": "Member of cat-family species (cats, lions, tigers, leopards).",
1240
- "solo": "Exactly one character is present in the image.",
1241
- "duo": "Exactly two characters are present in the image.",
1242
- "group": "Four or more characters are present in the image.",
1243
- "<3": "Visible heart symbol in text or icon form.",
1244
- }
1245
-
1246
-
1247
def _load_runtime_probe_rows(log=None) -> List[Dict[str, str]]:
    """Load and cache the rows of ``simplified_probe_tags.csv``.

    The CSV is looked up under ``data/analysis`` two directories above this
    module. A missing or unreadable file yields an empty list, and that
    result is cached too, so the file is not retried within the process.

    Args:
        log: Optional callable taking one message string.
    """
    global _cached_runtime_probe_rows
    if _cached_runtime_probe_rows is not None:
        return _cached_runtime_probe_rows

    csv_path = Path(__file__).resolve().parents[2] / "data" / "analysis" / "simplified_probe_tags.csv"
    loaded: List[Dict[str, str]] = []

    if csv_path.is_file():
        try:
            with csv_path.open("r", encoding="utf-8", newline="") as handle:
                loaded = list(csv.DictReader(handle))
        except Exception as e:
            if log:
                log(f"Stage3p: failed reading probe CSV: {e}")
            loaded = []
    elif log:
        log(f"Stage3p: probe CSV not found at {csv_path}; skipping probe step")

    _cached_runtime_probe_rows = loaded
    return loaded
1270
-
1271
-
1272
def _load_runtime_probe_wiki_defs() -> Dict[str, str]:
    """Load and cache the tag -> wiki definition map from ``tag_wiki_defs.json``.

    The file is looked up in the ``data`` directory two levels above this
    module. Loading is best-effort: a missing, unreadable, or malformed file
    yields an empty mapping, and that result is cached as well.

    Only string keys with string values are kept. Callers use ``.get()`` on
    the result and string methods on the values, so a JSON document with a
    different top-level shape (for example a list) or non-string values must
    not be handed back unchanged.

    Returns:
        Mapping of tag to raw wiki definition text (possibly empty).
    """
    global _cached_runtime_probe_wiki_defs
    if _cached_runtime_probe_wiki_defs is not None:
        return _cached_runtime_probe_wiki_defs

    data_dir = Path(__file__).resolve().parents[2] / "data"
    wiki_path = data_dir / "tag_wiki_defs.json"
    defs: Dict[str, str] = {}
    if wiki_path.is_file():
        try:
            with wiki_path.open("r", encoding="utf-8") as f:
                loaded = _json.load(f)
        except Exception:
            # Deliberate best-effort: glossary text is optional, so any read
            # or parse failure falls back to "no definitions".
            loaded = None
        if isinstance(loaded, dict):
            defs = {
                key: value
                for key, value in loaded.items()
                if isinstance(key, str) and isinstance(value, str)
            }
    _cached_runtime_probe_wiki_defs = defs
    return defs
1288
-
1289
-
1290
def _load_runtime_probe_tags(log=None) -> List[str]:
    """Return the cached list of runtime probe tags from the analysis CSV.

    Rows flagged ``selected_final`` are preferred; if no row carries that
    flag, rows flagged ``selected_initial`` are used instead. A flag counts
    as set when its stripped value is ``1``, ``true`` or ``True``. Tags that
    are empty after stripping are dropped.

    Args:
        log: Optional callable taking one message string.
    """
    global _cached_runtime_probe_tags
    if _cached_runtime_probe_tags is not None:
        return _cached_runtime_probe_tags

    truthy = {"1", "true", "True"}
    rows = _load_runtime_probe_rows(log=log)

    def _tags_flagged_by(column: str) -> List[str]:
        return [
            row.get("tag", "").strip()
            for row in rows
            if (row.get(column, "") or "").strip() in truthy
        ]

    final = _tags_flagged_by("selected_final")
    initial = _tags_flagged_by("selected_initial")
    preferred = final if final else initial
    tags: List[str] = [name for name in preferred if name]

    _cached_runtime_probe_tags = tags
    if log and tags:
        log(f"Stage3p: loaded {len(tags)} probe tags")
    return tags
1315
-
1316
-
1317
- def _is_real_wiki_def(text: str) -> bool:
1318
- t = (text or "").strip()
1319
- if not t:
1320
- return False
1321
- if t.lower().startswith("thumb "):
1322
- return False
1323
- return len(t) >= 20
1324
-
1325
-
1326
- def _clean_glossary_text(text: str) -> str:
1327
- t = " ".join((text or "").replace("\n", " ").replace("\r", " ").split())
1328
- if len(t) > 160:
1329
- t = t[:157].rstrip() + "..."
1330
- return t
1331
-
1332
-
1333
def _build_probe_candidate_display(probe_tags: Sequence[str], log=None) -> Dict[str, str]:
    """Build the text shown to the LLM for each probe tag.

    Every tag maps to its space-separated display form. When the tag's CSV
    row has ``needs_glossary`` set, a short gloss is appended as
    ``"<display> - <gloss>"``. The gloss comes from the wiki definitions if
    the entry looks real, otherwise from ``_PROBE_GLOSSARY_FALLBACKS``. If no
    gloss text is available the plain display form is used.

    Args:
        probe_tags: Canonical tags to build display strings for.
        log: Optional callable forwarded to the CSV loader.
    """
    rows_by_tag = {
        row.get("tag", "").strip(): row
        for row in _load_runtime_probe_rows(log=log)
    }
    wiki_defs = _load_runtime_probe_wiki_defs()
    truthy = {"1", "true", "True"}

    display: Dict[str, str] = {}
    for tag in probe_tags:
        label = _display_tag(tag)
        flag = (rows_by_tag.get(tag, {}).get("needs_glossary", "") or "").strip()
        if flag in truthy:
            definition = wiki_defs.get(tag, "")
            if not _is_real_wiki_def(definition):
                definition = _PROBE_GLOSSARY_FALLBACKS.get(tag, "")
            gloss = _clean_glossary_text(definition)
            if gloss:
                label = f"{label} - {gloss}"
        display[tag] = label

    return display
1354
-
1355
-
1356
def llm_infer_probe_tags(
    query_text: str,
    log=None,
    *,
    temperature: float = 0.0,
    max_tokens: int = 512,
    retries: int = 2,
    min_why: Optional[str] = "explicit",
) -> List[str]:
    """Pick probe tags for a description from the fixed runtime probe list.

    The whole probe list is sent to ``llm_select_indices`` in a single call
    (``single_shot`` mode, with chunk size and budget both set to the list
    length), and the returned indices are mapped back to tags.

    Args:
        query_text: Image description shown to the model.
        log: Optional callable taking one message string.
        temperature: Sampling temperature for the selection call.
        max_tokens: Completion budget for the selection call.
        retries: Retry count forwarded to the selection call.
        min_why: Forwarded unchanged to ``llm_select_indices``.

    Returns:
        Selected probe tags; empty when no probe tags are configured.
    """
    probe_tags = _load_runtime_probe_tags(log=log)
    if not probe_tags:
        return []

    tag_count = len(probe_tags)
    if log:
        log(f"Stage3p: probing {tag_count} tags (min_why={min_why or 'none'})")
    candidate_display = _build_probe_candidate_display(probe_tags, log=log)

    whole_list = max(1, tag_count)
    picked = llm_select_indices(
        query_text=query_text,
        candidates=probe_tags,
        max_pick=tag_count,
        log=log,
        retries=retries,
        mode="single_shot",
        chunk_size=whole_list,
        per_phrase_k=whole_list,
        temperature=temperature,
        max_tokens=max_tokens,
        return_metadata=False,
        return_diagnostics=False,
        min_why=min_why,
        candidate_display=candidate_display,
    )

    # Ignore any index that does not address the probe list.
    selected: List[str] = [probe_tags[i] for i in picked if 0 <= i < tag_count]

    if log:
        shown = ", ".join(selected) if selected else "(none)"
        log(f"Stage3p: selected {len(selected)} probe tags: {shown}")
    return selected
 
1
+ # psq_rag/llm/select.py
2
+ # Stage 3: Closed-Set Selection (LangChain-only implementation)
3
+ #
4
+ # This module intentionally uses LangChain for:
5
+ # - prompt templating (including {N})
6
+ # - LLM call orchestration
7
+ # - JSON parsing
8
+ #
9
+ # There is NO fallback path. If LangChain dependencies are missing, this module
10
+ # should fail loudly so you install them.
11
+
12
+ import os
13
+ import re
14
+ import csv
15
  from dataclasses import dataclass
16
  from pathlib import Path
17
+ from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union, cast, Mapping
18
+
19
+ from langchain_openai import ChatOpenAI
20
+ from langchain_core.prompts import ChatPromptTemplate
21
+ from langchain_core.output_parsers import PydanticOutputParser
22
+ from pydantic import BaseModel, Field, SecretStr
23
+ from rapidfuzz import fuzz
24
+
25
+ from psq_rag.retrieval.psq_retrieval import Candidate # Candidate(tag, score_*, count, sources)
26
+ from psq_rag.retrieval.state import get_tag_type_name, get_tag2aliases
27
+
28
+ # Character-typed tags that are generic categories, not actual named characters.
29
+ # These leak through the alias filter because they match common words in captions.
30
+ # They are excluded from the entity pipeline and instead routed to general selection.
31
+ _GENERIC_CHARACTER_TAGS = frozenset({
32
+ "fan_character",
33
+ "background_character",
34
+ "unnamed_character",
35
+ "unknown_character",
36
+ "anonymous_character",
37
+ "viewer",
38
+ "original_character",
39
+ })
40
+
41
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  # IMPORTANT ABOUT TEMPLATING:
43
  # - This string is rendered by LangChain's f-string template engine.
44
  # - Literal JSON braces must be escaped as {{ and }}.
45
  # - {N} is a real template variable and MUST be provided.
46
  SELECT_SYSTEM_TEMPLATE = """You are given a description of an image and a list of imageboard tags.
47
 
48
+ Select tags ONLY when they are explicitly stated in the image description text.
49
+ Do NOT select tags based on implication, plausibility, style assumptions, or world knowledge.
50
+ If a tag is not directly supported by explicit wording in the description, do not select it.
51
 
52
  Return JSON ONLY matching this schema:
53
 
54
  {{
55
  \"selections\": [
56
+ {{\"i\": <int>}},
57
  ...
58
  ]
59
  }}
 
61
  Rules:
62
  - Choose ONLY from indices 1..{N}.
63
  - Do NOT output tag text.
64
+ - Do NOT output any keys other than \"selections\", and inside each item only the item index \"i\".
65
  - Do select both a general tag and a more specific tag when both apply (for example, \"shirt\" and \"grey shirt\").
 
 
 
 
 
 
 
66
  """
67
+
68
+
69
def _get_select_system_template() -> str:
    """Return Stage 3 selection prompt text.

    This is the module-level ``SELECT_SYSTEM_TEMPLATE`` string, returned
    unchanged (it still contains its template placeholders).
    """
    return SELECT_SYSTEM_TEMPLATE
72
+
73
+
74
  ENTITY_SYSTEM_TEMPLATE = """You are given a description of an image and a list of CHARACTER tags.
75
+
76
+ These character tags have already been pre-filtered to only include characters whose names
77
+ (or known aliases) appear in the image description. Your job is to confirm which of these
78
+ pre-filtered candidates are the correct match for the character mentioned by the user.
79
+
80
  Return JSON ONLY matching this schema:
81
 
82
  {{
83
  \"selections\": [
84
+ {{\"i\": <int>}},
85
  ...
86
  ]
87
  }}
 
89
  Rules for character selection:
90
  - Choose ONLY from indices 1..{N}.
91
  - Do NOT output tag text.
 
92
  - Select the tag that best represents the character as described.
93
  - If the user described a specific variant (e.g. \"pikachu libre\", \"detective pikachu\"),
94
  select that specific variant tag.
95
+ - If the user described only the base character (e.g. just \"pikachu\"), select only the
96
+ base/default tag, NOT costume or variant tags.
97
+ - When uncertain between variants, prefer the simplest/most general tag.
98
+ """
99
+
100
+
101
+ USER_TEMPLATE = """IMAGE DESCRIPTION:
102
+ {image_description}
103
+
104
+ CANDIDATES (choose by index only):
105
+ {candidate_lines}
106
+
107
+ Select up to {per_call_budget} indices. Output fewer if uncertain.
108
+ """
109
+
110
+
111
@dataclass(frozen=True)
class Selected:
    """One validated selection: a per-call candidate index and its tag."""

    i: int  # index returned by the model (a key of the per-call index -> tag map)
    tag: str  # canonical tag (underscore form)
 
 
 
 
 
115
 
116
 
117
class Stage3SelectionItem(BaseModel):
    """One entry of the ``selections`` array in a Stage 3 response."""

    # Candidate number chosen by the model, as printed in the prompt.
    i: int = Field(..., description="1-based index into the candidate list.")
 
119
 
120
 
121
class Stage3SelectionResponse(BaseModel):
    """Top-level parsed Stage 3 response."""

    # Defaults to an empty list when the key is absent.
    selections: List[Stage3SelectionItem] = Field(default_factory=list)
123
+
124
+
125
+ def _build_response_format() -> Dict[str, Any]:
126
+ # Strict JSON Schema structured output.
127
+ schema = {
128
+ "type": "object",
129
+ "properties": {
130
+ "selections": {
131
+ "type": "array",
132
  "items": {
133
  "type": "object",
134
  "properties": {
135
  "i": {"type": "integer"},
 
136
  },
137
+ "required": ["i"],
138
  "additionalProperties": False,
139
  },
140
  }
141
+ },
142
+ "required": ["selections"],
143
+ "additionalProperties": False,
144
+ }
145
+
146
+ return {
147
+ "type": "json_schema",
148
+ "json_schema": {
149
+ "name": "stage3_selection",
150
+ "strict": True,
151
+ "schema": schema,
152
+ },
153
+ }
154
+
155
+
156
def _get_llm(*, temperature: float, max_tokens: int, response_format: Dict[str, Any]) -> ChatOpenAI:
    """Create a ``ChatOpenAI`` client pointed at OpenRouter.

    Configuration is read from the environment:
      - ``OPENROUTER_API_KEY`` (required)
      - ``OPENROUTER_MODEL`` (defaults to ``meta-llama/llama-3.1-8b-instruct``)
      - ``OPENROUTER_HTTP_REFERER`` / ``OPENROUTER_X_TITLE`` (optional headers)

    Args:
        temperature: Sampling temperature.
        max_tokens: Passed as ``max_completion_tokens``.
        response_format: Structured-output spec sent in the request body.

    Raises:
        RuntimeError: If ``OPENROUTER_API_KEY`` is unset or empty.
    """
    raw_key = os.getenv("OPENROUTER_API_KEY")
    if not raw_key:
        raise RuntimeError(
            "OPENROUTER_API_KEY is not set.\n"
            "Set it in your environment before running Stage 3."
        )

    model = os.getenv("OPENROUTER_MODEL", "meta-llama/llama-3.1-8b-instruct")

    # Optional attribution headers: only sent when the variable is non-empty.
    headers: Dict[str, str] = {}
    for header_name, env_var in (
        ("HTTP-Referer", "OPENROUTER_HTTP_REFERER"),
        ("X-Title", "OPENROUTER_X_TITLE"),
    ):
        value = os.getenv(env_var)
        if value:
            headers[header_name] = value

    # Provider-specific request body fields (OpenAI-compatible).
    # Response Healing plugin reduces malformed-JSON failures (syntax only).
    request_extras = {
        "response_format": response_format,
        "plugins": [{"id": "response-healing"}],
    }

    # OpenRouter OpenAI-compatible endpoint.
    return ChatOpenAI(
        model=model,
        base_url="https://openrouter.ai/api/v1",
        api_key=SecretStr(cast(str, raw_key)),
        temperature=temperature,
        max_completion_tokens=max_tokens,
        default_headers=headers,
        extra_body=request_extras,
    )
187
+
188
+
189
def _phrase_key_for_candidate(c: Candidate) -> str:
    """Return the candidate's deterministic "primary phrase" used for grouping.

    This is the smallest of its source phrases in sort order, or ``""`` when
    the candidate has no sources.
    """
    return min(c.sources) if c.sources else ""
194
+
195
+
196
def _interleave_round_robin(cands: Sequence[Candidate]) -> List[Candidate]:
    """Interleave candidates round-robin across their primary source phrases.

    Candidates are bucketed by primary phrase. Each bucket is ordered
    best-first by (score_combined, count), and the buckets are then drained
    one item per round in sorted phrase order.

    NOTE: counts only influence ordering here.
    """
    buckets: Dict[str, List[Candidate]] = {}
    for cand in cands:
        buckets.setdefault(_phrase_key_for_candidate(cand), []).append(cand)

    # A missing (or zero) count ranks as -1 for tie-breaking.
    ranked = {
        phrase: sorted(
            members,
            key=lambda x: (x.score_combined, (x.count or -1)),
            reverse=True,
        )
        for phrase, members in buckets.items()
    }
    phrase_order = sorted(ranked)
    deepest = max((len(members) for members in ranked.values()), default=0)

    out: List[Candidate] = []
    for level in range(deepest):
        for phrase in phrase_order:
            members = ranked[phrase]
            if level < len(members):
                out.append(members[level])
    return out
224
+
225
+
226
def _build_chunks(cands: Sequence[Candidate], chunk_size: int) -> List[List[Candidate]]:
    """Split the round-robin-ordered candidates into consecutive chunks.

    Every chunk has ``chunk_size`` items except possibly the last one.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be > 0, got {chunk_size}")

    ordered = _interleave_round_robin(cands)
    chunks: List[List[Candidate]] = []
    for start in range(0, len(ordered), chunk_size):
        chunks.append(ordered[start:start + chunk_size])
    return chunks
231
+
232
+
233
+ def _display_tag(tag: str) -> str:
234
+ # Display tags with spaces for the LLM, but keep canonical underscores internally.
235
+ return tag.replace("_", " ")
236
+
237
+
238
def _format_candidates_local(
    cands: Sequence[Candidate],
    candidate_display: Optional[Mapping[str, str]] = None,
) -> Tuple[str, Dict[int, str], Dict[int, Candidate]]:
    """Number the candidates for one LLM call and build lookup tables.

    Args:
        cands: Candidates in the order they should be shown.
        candidate_display: Optional per-tag display override. A missing or
            empty entry falls back to the tag with underscores as spaces.

    Returns:
        (candidate_lines, idx_to_tag, idx_to_candidate) where the text has
        one ``"<n>. <display>"`` line per candidate and both dicts are keyed
        by the same 1-based ``n``.
    """
    numbered = list(enumerate(cands, start=1))
    overrides = candidate_display or {}

    idx_to_tag: Dict[int, str] = {n: cand.tag for n, cand in numbered}
    idx_to_candidate: Dict[int, Candidate] = dict(numbered)
    lines: List[str] = [
        f"{n}. {overrides.get(cand.tag) or _display_tag(cand.tag)}"
        for n, cand in numbered
    ]
    return "\n".join(lines), idx_to_tag, idx_to_candidate
253
+
254
+
255
def _phrases_in_call(cands: Sequence[Candidate]) -> int:
    """Count the distinct source phrases across all given candidates."""
    return len({src for cand in cands for src in cand.sources})
261
+
262
+
263
  def _parse_validate_map(
264
  parsed: Any,
265
  idx_to_tag: Dict[int, str],
266
  per_call_budget: int,
267
+ ) -> Tuple[List[Selected], Dict[str, Any]]:
268
+ diag = {
269
+ "parse_ok": isinstance(parsed, dict),
270
+ "invalid_items": 0,
271
+ "oob_indices": 0,
272
+ "dupe_indices": 0,
273
+ "kept": 0,
274
+ }
275
+
276
+ if isinstance(parsed, BaseModel):
277
+ parsed = parsed.model_dump() if hasattr(parsed, "model_dump") else parsed.dict()
278
+ diag["parse_ok"] = isinstance(parsed, dict)
279
+
280
+ if not isinstance(parsed, dict):
281
+ return [], diag
282
+
283
+ selections = parsed.get("selections", [])
284
+ if not isinstance(selections, list):
285
+ diag["parse_ok"] = False
286
+ return [], diag
287
+
288
+ out: List[Selected] = []
289
+ seen_i = set()
290
+
291
  for item in selections:
292
  if len(out) >= per_call_budget:
293
  break
 
296
  continue
297
 
298
  i = item.get("i")
 
299
 
300
  if isinstance(i, bool) or not isinstance(i, int):
301
  diag["invalid_items"] += 1
 
306
  if i not in idx_to_tag:
307
  diag["oob_indices"] += 1
308
  continue
309
+ extra_keys = set(item.keys()) - {"i"}
310
+ if extra_keys:
311
  diag["invalid_items"] += 1
312
  continue
313
  seen_i.add(i)
314
  tag = idx_to_tag[i]
315
+ out.append(Selected(i=i, tag=tag))
316
+
317
+ diag["kept"] = len(out)
318
+ return out, diag
319
+
320
+
321
def _split_candidates_by_type(
    candidates: List[Candidate],
    log,
) -> Tuple[List[Tuple[int, Candidate]], List[Tuple[int, Candidate]]]:
    """Partition candidates into general and entity (character) lists.

    Routing by tag type name:
      - ``character``: entity list, unless the tag is one of
        ``_GENERIC_CHARACTER_TAGS``, which goes to the general list.
      - ``copyright``: dropped (too broad for image generation).
      - ``general`` / ``artist`` / ``species`` / ``meta``: general list.
      - anything else (including an unknown type): general list, counted
        as unknown.

    Args:
        candidates: Candidates to split.
        log: Optional callable taking one message string.

    Returns:
        (general_list, entity_list); each item is (original_index, candidate).
    """
    general_with_idx: List[Tuple[int, Candidate]] = []
    entity_with_idx: List[Tuple[int, Candidate]] = []
    known_general_types = ("general", "artist", "species", "meta")

    copyright_count = 0
    generic_char_count = 0
    unknown_count = 0

    for position, cand in enumerate(candidates):
        type_name = get_tag_type_name(cand.tag)

        if type_name == "copyright":
            # Filter out copyright/series tags entirely.
            copyright_count += 1
            continue

        if type_name == "character" and cand.tag not in _GENERIC_CHARACTER_TAGS:
            entity_with_idx.append((position, cand))
            continue

        # Everything left is routed to general selection.
        general_with_idx.append((position, cand))
        if type_name == "character":
            generic_char_count += 1
        elif type_name not in known_general_types:
            unknown_count += 1

    if log:
        log(
            f"Stage3 split: "
            f"general={len(general_with_idx)} "
            f"entity={len(entity_with_idx)} "
            f"copyright_filtered={copyright_count} "
            f"generic_char_to_general={generic_char_count} "
            f"unknown_type={unknown_count}"
        )

    return general_with_idx, entity_with_idx
374
+
375
+
376
+ # Regex to strip series/franchise suffixes from aliases, e.g. _(sonic), _(mlp), _(character)
377
+ _SERIES_SUFFIX_RE = re.compile(r"_\([^)]+\)$")
378
+
379
+
380
+ def _normalize_for_matching(text: str) -> str:
381
+ """Lowercase, replace underscores with spaces, strip series suffixes."""
382
+ text = text.lower().strip()
383
+ text = _SERIES_SUFFIX_RE.sub("", text)
384
+ text = text.replace("_", " ")
385
+ return text
386
+
387
+
388
def _query_words(query: str) -> Set[str]:
    """Extract individual words from the user query for matching.

    The query is first passed through ``_normalize_for_matching`` and then
    split on whitespace; the result is the set of distinct tokens.
    """
    return set(_normalize_for_matching(query).split())
391
+
392
+
393
+ def _alias_matches_query(alias_norm: str, query_words: Set[str], query_norm: str,
394
+ fuzzy_threshold: int = 85) -> bool:
395
+ """Check if an alias matches the user query.
396
+
397
+ Matching logic:
398
+ 1. Exact substring: alias appears as a substring of the query
399
+ 2. Word subset: all words in the alias appear in the query words
400
+ 3. Fuzzy: alias is close to a word in the query (handles typos)
401
+ """
402
+ # Exact substring match
403
+ if alias_norm in query_norm:
404
+ return True
405
+
406
+ alias_words = alias_norm.split()
407
+ if not alias_words:
408
+ return False
409
+
410
+ # Word subset match: all alias words must appear in query
411
+ if all(w in query_words for w in alias_words):
412
+ return True
413
+
414
+ # For single-word aliases, try fuzzy matching against each query word
415
+ if len(alias_words) == 1:
416
+ for qw in query_words:
417
+ if fuzz.ratio(alias_words[0], qw) >= fuzzy_threshold:
418
+ return True
419
+
420
+ # For multi-word aliases, try fuzzy partial ratio against whole query
421
+ if len(alias_words) > 1:
422
+ if fuzz.partial_ratio(alias_norm, query_norm) >= fuzzy_threshold:
423
+ return True
424
+
425
+ return False
426
+
427
+
428
def _character_matches_via_aliases(
    tag: str,
    query: str,
    tag2aliases: Dict[str, List[str]],
    query_words: Set[str],
    query_norm: str,
    fuzzy_threshold: int = 85,
) -> bool:
    """Decide whether a character tag is referenced by the user query.

    The tag matches when its own normalized name matches the query or when
    at least one registered alias does. Aliases that normalize to an empty
    string are skipped. A tag without registered aliases is still checked by
    name.

    Args:
        tag: Canonical character tag.
        query: Raw query text (not used by this function; the pre-normalized
            forms below are used instead).
        tag2aliases: Mapping of tag to its registered aliases.
        query_words: Set of normalized query words.
        query_norm: Normalized full query text.
        fuzzy_threshold: Minimum fuzzy score forwarded to the matcher.
    """
    def _hits(name_norm: str) -> bool:
        return _alias_matches_query(name_norm, query_words, query_norm, fuzzy_threshold)

    # Check the tag name itself first.
    if _hits(_normalize_for_matching(tag)):
        return True

    # Then every registered alias, lazily, stopping at the first match.
    normalized_aliases = (
        _normalize_for_matching(alias) for alias in tag2aliases.get(tag, [])
    )
    return any(_hits(alias_norm) for alias_norm in normalized_aliases if alias_norm)
459
+
460
+
461
  def llm_select_indices(
462
+ query_text: str, # kept for compatibility; treated as IMAGE DESCRIPTION
463
+ candidates: Union[
464
+ Sequence[Candidate],
465
+ Sequence[str],
466
+ Sequence[Tuple[str, float]],
467
+ ],
468
+ max_pick: int, # legacy param; applied after union + ordering (optional)
469
+ log,
470
+ retries: int = 2,
471
+ *,
472
+ mode: str = "chunked_map_union", # "single_shot" or "chunked_map_union"
473
+ chunk_size: int = 60,
474
+ per_phrase_k: int = 2, # per-call budget = per_phrase_k * phrases_in_call
475
+ temperature: float = 0.0,
476
+ max_tokens: int = 512,
477
+ return_metadata: bool = False,
478
+ return_diagnostics: bool = False,
479
+ min_why: Optional[str] = None,
480
+ candidate_display: Optional[Mapping[str, str]] = None,
481
+ ) -> Union[
482
+ List[int],
483
+ Tuple[List[int], Dict[str, str]],
484
+ Tuple[List[int], Dict[str, str], Dict[str, Any]],
485
+ ]:
486
+ """Return indices into the ORIGINAL candidates list (legacy interface).
487
+
488
+ min_why: legacy compatibility argument; ignored in explicit-only mode.
489
+
490
+ This implementation uses LangChain ONLY.
491
+
492
+ NOTE: query_text is treated as the image description (original prompt).
493
+ """
494
+
495
+ image_description = query_text
496
+
497
+ # Normalize candidates:
498
+ # - preferred: List[Candidate]
499
+ # - legacy: List[(tag, sim)] (count/sources unavailable)
500
+ norm: List[Candidate] = []
501
+ tag_to_first_index: Dict[str, int] = {}
502
+
503
+ branch = "empty"
504
+ cand0_type = type(candidates[0]).__name__ if candidates else "none"
505
+
506
+ if candidates and isinstance(candidates[0], Candidate):
507
+ branch = "candidate"
508
+ typed_candidates = cast(Sequence[Candidate], candidates)
509
+ for idx, c in enumerate(typed_candidates):
510
+ if c.tag not in tag_to_first_index:
511
+ tag_to_first_index[c.tag] = idx
512
+ norm.append(c)
513
+ elif candidates and isinstance(candidates[0], str):
514
+ branch = "string"
515
+ typed_candidates = cast(Sequence[str], candidates)
516
+ for idx, tag in enumerate(typed_candidates):
517
+ if tag not in tag_to_first_index:
518
+ tag_to_first_index[tag] = idx
519
+ norm.append(
520
+ Candidate(
521
+ tag=tag,
522
+ score_combined=0.0,
523
+ score_fasttext=None,
524
+ score_context=None,
525
+ count=None,
526
+ sources=[],
527
+ )
528
+ )
529
+ else:
530
+ if candidates:
531
+ branch = "tuple"
532
+ typed_candidates = cast(Sequence[Tuple[str, float]], candidates)
533
+ for idx, row in enumerate(typed_candidates):
534
+ if not isinstance(row, (list, tuple)) or len(row) < 2:
535
+ raise ValueError("Stage 3 candidates must be Candidate, tag strings, or (tag, score) tuples.")
536
+ tag, sim = row[0], row[1]
537
+ if tag not in tag_to_first_index:
538
+ tag_to_first_index[tag] = idx
539
+ norm.append(
540
+ Candidate(
541
+ tag=tag,
542
+ score_combined=float(sim),
543
+ score_fasttext=None,
544
+ score_context=None,
545
+ count=None,
546
+ sources=[],
547
+ )
548
+ )
549
+
550
+ if log:
551
+ if norm:
552
+ log(
553
+ "Stage3 input: "
554
+ f"type0={cand0_type} "
555
+ f"branch={branch} "
556
+ f"norm0_score={norm[0].score_combined!r} "
557
+ f"norm0_sources_empty={not bool(norm[0].sources)}"
558
+ )
559
+ else:
560
+ log(f"Stage3 input: type0={cand0_type} branch={branch} (no candidates)")
561
+
562
+ if mode not in ("single_shot", "chunked_map_union"):
563
+ raise ValueError(f"Invalid mode: {mode}")
564
+
565
+ response_format = _build_response_format()
566
+ llm = _get_llm(temperature=temperature, max_tokens=max_tokens, response_format=response_format)
567
+ model_name = os.getenv("OPENROUTER_MODEL", "meta-llama/llama-3.1-8b-instruct")
568
+
569
+ parser = PydanticOutputParser(pydantic_object=Stage3SelectionResponse)
570
+ select_system_template = _get_select_system_template()
571
+
572
+ # Global union of selected tags across calls.
573
+ best_tags: Set[str] = set()
574
+ diagnostics: Dict[str, Any] = {
575
+ "mode": mode,
576
+ "chunk_strategy": "interleave",
577
+ "chunk_passes": 1,
578
+ "chunk_shuffle_within_call": False,
579
+ "calls_total": 0,
580
+ "calls_with_selection": 0,
581
+ "calls_exhausted_retries": 0,
582
+ "attempts_total": 0,
583
+ "attempt_errors": 0,
584
+ "attempt_parse_fail": 0,
585
+ "attempt_parse_ok": 0,
586
+ "invalid_items_total": 0,
587
+ "oob_indices_total": 0,
588
+ "dupe_indices_total": 0,
589
+ "kept_total": 0,
590
+ "attempts_by_n_local": {},
591
+ }
592
+
593
+ def _record_attempt_for_n(n_local: int, *, parse_ok: bool, error: bool) -> None:
594
+ by_n = diagnostics["attempts_by_n_local"]
595
+ key = str(n_local)
596
+ if key not in by_n:
597
+ by_n[key] = {
598
+ "attempts": 0,
599
+ "parse_ok": 0,
600
+ "parse_fail": 0,
601
+ "errors": 0,
602
+ }
603
+ by_n[key]["attempts"] += 1
604
+ if error:
605
+ by_n[key]["errors"] += 1
606
+ elif parse_ok:
607
+ by_n[key]["parse_ok"] += 1
608
+ else:
609
+ by_n[key]["parse_fail"] += 1
610
+
611
+ def run_call(call_cands: Sequence[Candidate], label: str, system_template: str) -> None:
612
+ # Create chain with the provided system template
613
+ prompt = ChatPromptTemplate.from_messages(
614
+ [
615
+ ("system", system_template),
616
+ ("human", USER_TEMPLATE),
617
+ ],
618
+ template_format="f-string",
619
+ )
620
+ chain = prompt | llm | parser
621
+
622
+ ordered = _interleave_round_robin(call_cands) if mode == "single_shot" else list(call_cands)
623
+ candidate_lines, idx_to_tag, idx_to_candidate = _format_candidates_local(
624
+ ordered,
625
+ candidate_display=candidate_display,
626
+ )
627
+ N_local = len(idx_to_tag)
628
+ diagnostics["calls_total"] += 1
629
+
630
+ phrases = _phrases_in_call(call_cands)
631
+ per_call_budget = max(1, per_phrase_k * phrases) if phrases > 0 else per_phrase_k
632
+ summary_logged = False
633
+
634
+ if log:
635
+ log(f"Stage3 {label}: candidates (local indices):\n{candidate_lines}")
636
+ if phrases > 0:
637
+ distinct_phrases = sorted({src for c in call_cands for src in c.sources})
638
+ log(
639
+ f"Stage3 {label}: distinct_phrases={len(distinct_phrases)} "
640
+ f"phrases={', '.join(distinct_phrases)}"
641
+ )
642
+
643
+ # Invoke LangChain chain (templating fills {N} and other vars)
644
+ for att in range(retries + 1):
645
+ try:
646
+ diagnostics["attempts_total"] += 1
647
+ if log:
648
+ log(
649
+ f"Stage3 {label}: "
650
+ f"model={model_name} "
651
+ f"N={N_local} "
652
+ f"phrases={phrases} "
653
+ f"per_call_budget={per_call_budget} "
654
+ f"response_healing=on"
655
+ )
656
+
657
+ parsed = chain.invoke(
658
+ {
659
+ "N": N_local,
660
+ "image_description": image_description,
661
+ "candidate_lines": candidate_lines,
662
+ "per_call_budget": per_call_budget,
663
+ }
664
+ )
665
+ selected, diag = _parse_validate_map(parsed, idx_to_tag, per_call_budget=per_call_budget)
666
+ diagnostics["invalid_items_total"] += int(diag.get("invalid_items", 0))
667
+ diagnostics["oob_indices_total"] += int(diag.get("oob_indices", 0))
668
+ diagnostics["dupe_indices_total"] += int(diag.get("dupe_indices", 0))
669
+ diagnostics["kept_total"] += int(diag.get("kept", 0))
670
+ if bool(diag.get("parse_ok", False)):
671
+ diagnostics["attempt_parse_ok"] += 1
672
+ _record_attempt_for_n(N_local, parse_ok=True, error=False)
673
+ else:
674
+ diagnostics["attempt_parse_fail"] += 1
675
+ _record_attempt_for_n(N_local, parse_ok=False, error=False)
676
+ if log:
677
+ log(f"Stage3 {label}: attempt {att+1} diag={diag}")
678
+ if not summary_logged and (selected or att == retries):
679
+ log(
680
+ f"Stage3 {label}: summary "
681
+ f"N={N_local} selected={len(selected)} per_call_budget={per_call_budget}"
682
+ )
683
+ summary_logged = True
 
 
 
684
  if selected:
685
  lines = [
686
  f"Stage3 {label} selections:",
687
  *[
688
  (
689
  f' - i={s.i} tag="{s.tag}" '
 
690
  f"sources={idx_to_candidate.get(s.i).sources if idx_to_candidate.get(s.i) else []}"
691
  )
692
  for s in selected
693
  ],
694
  ]
695
+ log("\n".join(lines))
696
+ else:
697
+ log(f"Stage3 {label} selections: (none)")
698
+
699
  if selected:
700
  diagnostics["calls_with_selection"] += 1
701
  for s in selected:
702
+ best_tags.add(s.tag)
 
 
703
  return
704
+
705
+ except Exception as e:
706
+ diagnostics["attempt_errors"] += 1
707
+ _record_attempt_for_n(N_local, parse_ok=False, error=True)
708
+ if log:
709
+ log(f"Stage3 {label}: attempt {att+1} error: {e}")
710
+
711
+ if log:
712
+ log(f"Stage3 {label}: gave up after {retries+1} attempts")
713
+ diagnostics["calls_exhausted_retries"] += 1
714
+
715
+ # Split candidates by type (general vs entity)
716
+ general_with_idx, entity_with_idx = _split_candidates_by_type(norm, log)
717
+
718
+ # Extract just the candidates for LLM calls
719
+ general_cands = [cand for _, cand in general_with_idx]
720
+ entity_cands = [cand for _, cand in entity_with_idx]
721
+
722
+ # Process general candidates (attributes, actions, species, etc.)
723
+ if general_cands:
724
+ if mode == "single_shot":
725
+ run_call(general_cands, "general_single_shot", select_system_template)
726
+ else:
727
+ base_chunks = _build_chunks(general_cands, chunk_size)
728
+ for chunk_idx, chunk in enumerate(base_chunks):
729
+ run_call(chunk, f"general_chunk_{chunk_idx}", select_system_template)
730
+
731
+ # Process entity candidates (characters only) with alias-based pre-filtering
732
+ if entity_cands:
733
+ tag2aliases = get_tag2aliases()
734
+ qwords = _query_words(image_description)
735
+ qnorm = _normalize_for_matching(image_description)
736
+
737
+ filtered_entity_cands: List[Candidate] = []
738
+ filtered_out: List[str] = []
739
+
740
+ for cand in entity_cands:
741
+ if _character_matches_via_aliases(
742
+ cand.tag, image_description, tag2aliases, qwords, qnorm
743
+ ):
744
+ filtered_entity_cands.append(cand)
745
+ else:
746
+ filtered_out.append(cand.tag)
747
+
748
+ if log:
749
+ log(
750
+ f"Stage3 entity alias filter: "
751
+ f"before={len(entity_cands)} "
752
+ f"after={len(filtered_entity_cands)} "
753
+ f"removed={len(filtered_out)}"
754
+ )
755
+ if filtered_out:
756
+ log(f"Stage3 entity alias filter removed: {filtered_out[:20]}")
757
+
758
+ if filtered_entity_cands:
759
+ if mode == "single_shot":
760
+ run_call(filtered_entity_cands, "entity_single_shot", ENTITY_SYSTEM_TEMPLATE)
761
+ else:
762
+ base_chunks = _build_chunks(filtered_entity_cands, chunk_size)
763
+ for chunk_idx, chunk in enumerate(base_chunks):
764
+ run_call(chunk, f"entity_chunk_{chunk_idx}", ENTITY_SYSTEM_TEMPLATE)
765
+
766
+ if min_why is not None and log:
767
+ log("Stage3: min_why is ignored in explicit-only no-why mode")
768
+
769
+ # Deterministic ordering: count desc (count not shown to LLM); ties fall back to tag name, also descending because reverse=True applies to the whole sort key.
 
 
 
 
 
 
770
  count_by_tag = {c.tag: (c.count if c.count is not None else -1) for c in norm}
771
+ ordered_tags = sorted(best_tags, key=lambda t: (count_by_tag.get(t, -1), t), reverse=True)
772
+
773
+ # Legacy cap: apply AFTER union + ordering.
774
+ if isinstance(max_pick, int) and max_pick > 0:
775
+ ordered_tags = ordered_tags[:max_pick]
776
+
777
+ # Map back to original indices
778
+ out_idx: List[int] = []
779
  tag_why: Dict[str, str] = {}
780
  for t in ordered_tags:
781
  if t in tag_to_first_index:
782
  out_idx.append(tag_to_first_index[t])
783
+ # Why labels removed in explicit-only no-why mode.
784
+
785
+ if diagnostics["attempts_total"] > 0:
786
+ diagnostics["attempt_failure_rate"] = (
787
+ diagnostics["attempt_parse_fail"] + diagnostics["attempt_errors"]
788
+ ) / diagnostics["attempts_total"]
789
+ else:
790
+ diagnostics["attempt_failure_rate"] = 0.0
791
+
792
+ if diagnostics["calls_total"] > 0:
793
+ diagnostics["call_exhaustion_rate"] = (
794
+ diagnostics["calls_exhausted_retries"] / diagnostics["calls_total"]
795
+ )
796
+ else:
797
+ diagnostics["call_exhaustion_rate"] = 0.0
798
+
799
+ if return_metadata:
800
+ if return_diagnostics:
801
+ return out_idx, tag_why, diagnostics
802
+ return out_idx, tag_why
803
+
804
+ return out_idx
805
+
806
+
807
+ # ---------------------------------------------------------------------------
808
+ # Stage 3s: Structural tag inference (solo/duo/male/female/anthro/… )
809
+ # ---------------------------------------------------------------------------
810
+ # Group-based approach: tags are organized into semantic groups loaded from
811
+ # data/structural_tag_definitions.csv, or else built from tag_wiki_defs.json with
812
+ # curated fallback definitions where a wiki entry is missing or thumbnail-only.
813
+ #
814
+ # Each group specifies a constraint mode:
815
+ # "exclusive" = pick exactly one (e.g. character count)
816
+ # "multi" = pick all that apply (e.g. body type, gender)
817
+
818
+ import json as _json
819
+
820
+ @dataclass
821
+ class StructuralGroup:
822
+ """One category of structural tags to probe."""
823
+ name: str
824
+ constraint: str # "exclusive" or "multi"
825
+ tags: List[Tuple[str, str]] # (tag, definition) pairs
826
+
827
+
828
+ def _load_structural_groups_from_csv() -> List[StructuralGroup]:
829
+ """Load structural groups from data/structural_tag_definitions.csv."""
830
+ data_dir = Path(__file__).resolve().parents[2] / "data"
831
+ csv_path = data_dir / "structural_tag_definitions.csv"
832
+ if not csv_path.is_file():
833
+ return []
834
+
835
+ groups_by_name: Dict[str, List[Tuple[str, str]]] = {}
836
+ constraints_by_name: Dict[str, str] = {}
837
+
838
+ with csv_path.open("r", encoding="utf-8", newline="") as f:
839
+ reader = csv.DictReader(f)
840
+ for row in reader:
841
+ enabled = (row.get("enabled") or "1").strip().lower()
842
+ if enabled in {"0", "false", "no"}:
843
+ continue
844
+
845
+ group_name = (row.get("group_name") or "").strip()
846
+ constraint = (row.get("constraint") or "multi").strip().lower()
847
+ tag = (row.get("tag") or "").strip()
848
+ definition = " ".join((row.get("definition") or "").split())
849
+
850
+ if not group_name or not tag or not definition:
851
+ continue
852
+ if constraint not in {"exclusive", "multi"}:
853
+ constraint = "multi"
854
+
855
+ if group_name not in groups_by_name:
856
+ groups_by_name[group_name] = []
857
+ constraints_by_name[group_name] = constraint
858
+ groups_by_name[group_name].append((tag, definition))
859
+
860
+ out: List[StructuralGroup] = []
861
+ for group_name, tags in groups_by_name.items():
862
+ if not tags:
863
+ continue
864
+ out.append(
865
+ StructuralGroup(
866
+ name=group_name,
867
+ constraint=constraints_by_name.get(group_name, "multi"),
868
+ tags=tags,
869
+ )
870
+ )
871
+ return out
872
+
873
def _load_structural_groups() -> List[StructuralGroup]:
    """Build structural groups from local config file with legacy fallback.

    Preferred source:
        data/structural_tag_definitions.csv
    Fallback:
        tag_wiki_defs.json + curated hardcoded defaults

    Returns:
        The CSV-defined groups when the CSV yields any. Otherwise five
        built-in groups (character_count, body_type, gender, clothing_state,
        visual_elements) whose definitions come from tag_wiki_defs.json when
        that file has a usable entry, and from the curated text below when it
        does not.
    """
    csv_groups = _load_structural_groups_from_csv()
    if csv_groups:
        return csv_groups

    data_dir = Path(__file__).resolve().parents[2] / "data"

    # Load wiki definitions (may not exist yet). An unreadable, malformed or
    # non-object JSON file is treated like a missing one, so the curated
    # fallbacks are used instead of aborting structural inference.
    wiki_defs: Dict[str, str] = {}
    wiki_path = data_dir / "tag_wiki_defs.json"
    if wiki_path.is_file():
        try:
            with wiki_path.open("r", encoding="utf-8") as f:
                loaded = _json.load(f)
        except (OSError, ValueError):
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
            loaded = None
        if isinstance(loaded, dict):
            wiki_defs = loaded

    def _def(tag: str, fallback: str) -> str:
        """Get wiki definition if it's real text, otherwise use fallback."""
        d = wiki_defs.get(tag, "")
        # Skip non-text, thumbnail-only and very short definitions.
        if not isinstance(d, str) or not d or d.startswith("thumb ") or len(d) < 15:
            return fallback
        return d[:200]  # cap length for prompt

    return [
        # -- Group A: Character Count (exclusive) --
        StructuralGroup(
            name="character_count",
            constraint="exclusive",
            tags=[
                ("zero_pictured", _def("zero_pictured",
                    "No characters or living beings appear in the image")),
                ("solo", _def("solo",
                    "Exactly one character appears in the image")),
                ("duo", _def("duo",
                    "Exactly two characters appear in the image")),
                ("trio", _def("trio",
                    "Exactly three characters appear in the image")),
                ("group", _def("group",
                    "Four or more characters appear in the image")),
            ],
        ),
        # -- Group B: Body Type (multi, per character) --
        # Key distinction the LLM must learn:
        #   anthro   = ANIMAL with human body shape (upright, hands)
        #   humanoid = HUMAN or near-human (elf, dwarf) with NO animal features
        #   feral    = normal animal shape, on all fours
        StructuralGroup(
            name="body_type",
            constraint="multi",
            tags=[
                ("anthro", _def("anthro",
                    "An animal character with a human-like body: walks upright on two legs, "
                    "has arms and hands. Examples: a wolf-person, a fox standing up. "
                    "Still has animal features like fur, tail, muzzle")),
                ("feral", _def("feral",
                    "A regular animal in its natural body shape. Walks on all fours (or "
                    "flies/swims naturally). NOT standing upright, NOT humanized")),
                ("humanoid", _def("humanoid",
                    "A human or human-like character with NO animal features. Includes "
                    "humans, elves, dwarves, and fantasy races that look human. "
                    "Does NOT include animal-people — those are anthro")),
                ("taur", _def("taur",
                    "A centaur-like body: human or anthro upper body attached to a "
                    "four-legged animal lower body")),
            ],
        ),
        # -- Group C: Gender (multi, per character) --
        StructuralGroup(
            name="gender",
            constraint="multi",
            tags=[
                ("male", _def("male",
                    "A character described as male, a boy, or with he/him pronouns")),
                ("female", _def("female",
                    "A character described as female, a girl, or with she/her pronouns")),
                ("ambiguous_gender", _def("ambiguous_gender",
                    "A character whose gender is not stated or cannot be determined")),
                ("intersex", _def("intersex",
                    "A character explicitly described as intersex or hermaphrodite")),
            ],
        ),
        # -- Group D: Clothing State (multi) --
        StructuralGroup(
            name="clothing_state",
            constraint="multi",
            tags=[
                ("clothed", _def("clothed",
                    "Wearing clothes on BOTH chest/torso AND legs/waist. "
                    "Examples: shirt and pants, dress, full outfit")),
                ("nude", _def("nude",
                    "Wearing NO clothes at all. Completely naked, no shirt and no pants")),
                ("topless", _def("topless",
                    "NO shirt/top (bare chest), BUT wearing pants/bottoms. "
                    "Upper body exposed, lower body covered")),
                ("bottomless", _def("bottomless",
                    "Wearing shirt/top on chest, BUT NO pants/bottoms. "
                    "Upper body covered, lower body exposed")),
            ],
        ),
        # -- Group E: Common Visual Elements (multi) --
        StructuralGroup(
            name="visual_elements",
            constraint="multi",
            tags=[
                ("looking_at_viewer", _def("looking_at_viewer",
                    "A character is looking directly at the camera or viewer")),
                ("text", _def("text",
                    "The image contains visible writing, words, or lettering")),
            ],
        ),
    ]
996
+
997
+
998
+ def _build_structural_prompt(groups: List[StructuralGroup]) -> Tuple[str, List[Tuple[str, str]]]:
999
+ """Build numbered statement list from structural groups.
1000
+
1001
+ Returns (formatted_text, flat_list_of_(tag, definition)_pairs).
1002
+ The flat list maps 1-based statement numbers to tags.
1003
+ """
1004
+ lines: List[str] = []
1005
+ flat: List[Tuple[str, str]] = []
1006
+ idx = 1
1007
+
1008
+ for g in groups:
1009
+ constraint_label = "pick EXACTLY ONE" if g.constraint == "exclusive" else "pick ALL that apply"
1010
+ group_header = f"--- {g.name.replace('_', ' ').upper()} ({constraint_label}) ---"
1011
+ lines.append(group_header)
1012
+ for tag, defn in g.tags:
1013
+ lines.append(f"{idx}. {defn}")
1014
+ flat.append((tag, defn))
1015
+ idx += 1
1016
+ lines.append("") # blank line between groups
1017
+
1018
+ return "\n".join(lines), flat
1019
+
1020
+
1021
# System prompt for structural tag inference. Literal JSON braces are doubled
# because the template is rendered with template_format="f-string".
#
# NOTE: the worked example hard-codes statement numbers 2, 6, 10 and 14. Those
# match the numbering produced from the built-in fallback groups (solo, anthro,
# male, clothed).
# NOTE(review): when groups come from structural_tag_definitions.csv the
# numbering may differ - confirm the example still lines up with that file.
STRUCTURAL_SYSTEM_TEMPLATE = """You classify image descriptions by selecting true statements from a numbered list.

The statements are organized into GROUPS. Each group header tells you how many to pick:
- "pick EXACTLY ONE" = choose the single best match in that group
- "pick ALL that apply" = choose every statement that is true

IMPORTANT RULES:
1. ONLY select a statement if the description directly says it or makes it very obvious.
2. Do NOT guess or assume things the description does not mention.
3. For body type: "anthro" means an ANIMAL with a human-shaped body (walks upright, has hands, but still has fur/tail/muzzle). "humanoid" means HUMAN or human-like with NO animal features. A wolf standing on two legs = anthro, NOT humanoid.
4. For gender: only select male/female/intersex when there is explicit textual evidence (such as gender words or pronouns). Do not infer gender from species, body shape, clothing, or style. If no reliable gender cue is present, do not select male/female/intersex; use ambiguous_gender instead.
5. For clothing state: READ CAREFULLY! "topless" = bare chest, wearing pants. "bottomless" = wearing shirt, no pants. If unsure, re-read the description.
6. If clothing is not mentioned, do NOT pick any clothing statement.

Return JSON ONLY:
{{"selections": [{{"i": 1}}, {{"i": 5}}]}}

EXAMPLE:
Description: "A muscular male wolf standing in a forest, wearing jeans, giving a thumbs up"
Answer: {{"selections": [{{"i": 2}}, {{"i": 6}}, {{"i": 10}}, {{"i": 14}}]}}
Why: One character = solo (2). Wolf standing upright with hands = anthro (6), NOT humanoid because it is a wolf. Male (10). Wearing jeans = clothed (14)."""

# Human-turn prompt. {image_description} and {statement_lines} are filled in
# when the chain is invoked.
STRUCTURAL_USER_TEMPLATE = """Read this image description and select which statements are true.

IMAGE DESCRIPTION:
{image_description}

STATEMENTS (pick by number):
{statement_lines}"""
1050
+
1051
+
1052
class StructuralSelectionItem(BaseModel):
    # One statement picked by the model, identified only by its number in the
    # prompt's statement list.
    i: int = Field(..., description="1-based index into the statement list.")
1054
+
1055
+
1056
class StructuralSelectionResponse(BaseModel):
    # Top-level shape of the model's JSON answer; defaults to no selections
    # when the key is absent.
    selections: List[StructuralSelectionItem] = Field(default_factory=list)
1058
+
1059
+
1060
+ def _build_structural_response_format() -> Dict[str, Any]:
1061
+ schema = {
1062
+ "type": "object",
1063
+ "properties": {
1064
+ "selections": {
1065
+ "type": "array",
1066
+ "items": {
1067
+ "type": "object",
1068
+ "properties": {
1069
+ "i": {"type": "integer"},
1070
+ },
1071
+ "required": ["i"],
1072
+ "additionalProperties": False,
1073
+ },
1074
+ }
1075
+ },
1076
+ "required": ["selections"],
1077
+ "additionalProperties": False,
1078
+ }
1079
+ return {
1080
+ "type": "json_schema",
1081
+ "json_schema": {
1082
+ "name": "structural_selection",
1083
+ "strict": True,
1084
+ "schema": schema,
1085
+ },
1086
+ }
1087
+
1088
+
1089
+ # Cache the loaded groups so the definition files (CSV, or the JSON fallback) are read only once per process.
1090
+ _cached_structural_groups: Optional[List[StructuralGroup]] = None
1091
+
1092
+
1093
def _get_structural_groups() -> List[StructuralGroup]:
    """Return the structural groups, loading them on first use.

    The result of ``_load_structural_groups()`` is stored in the module-level
    ``_cached_structural_groups`` and handed back on every later call.
    """
    global _cached_structural_groups
    if _cached_structural_groups is not None:
        return _cached_structural_groups
    _cached_structural_groups = _load_structural_groups()
    return _cached_structural_groups
1098
+
1099
+
1100
def llm_infer_structural_tags(
    query_text: str,
    log=None,
    *,
    temperature: float = 0.0,
    max_tokens: int = 512,
    retries: int = 2,
) -> List[str]:
    """Infer structural tags via LLM using group-based statement agreement.

    The structural groups are rendered as one numbered statement list, the
    model is asked which statements hold for ``query_text``, and the chosen
    statement numbers are mapped back to their tags.

    Args:
        query_text: Image description to classify.
        log: Optional callable taking one string; receives progress messages.
        temperature: Sampling temperature forwarded to the LLM factory.
        max_tokens: Completion budget forwarded to the LLM factory.
        retries: Extra attempts after the first; any exception raised while
            invoking the chain or decoding its answer triggers the next one.

    Returns:
        Tags for the selected statements in the order the model listed them,
        without duplicates (e.g. ["solo", "anthro", "male", "clothed"]).
        Selections that are not integers in 1..N are ignored. Returns an
        empty list when every attempt fails.
    """
    if log:
        log("Stage3s (structural): inferring structural tags via group-based statement agreement")

    groups = _get_structural_groups()
    statement_lines, flat_tags = _build_structural_prompt(groups)
    N = len(flat_tags)

    llm = _get_llm(
        temperature=temperature,
        max_tokens=max_tokens,
        response_format=_build_structural_response_format(),
    )
    model_name = os.getenv("OPENROUTER_MODEL", "meta-llama/llama-3.1-8b-instruct")

    answer_parser = PydanticOutputParser(pydantic_object=StructuralSelectionResponse)
    chat_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", STRUCTURAL_SYSTEM_TEMPLATE),
            ("human", STRUCTURAL_USER_TEMPLATE),
        ],
        template_format="f-string",
    )
    chain = chat_prompt | llm | answer_parser

    if log:
        group_summary = ", ".join(f"{g.name}({len(g.tags)})" for g in groups)
        log(f"Stage3s: model={model_name} groups=[{group_summary}] total_statements={N}")

    for attempt_no in range(1, retries + 2):
        try:
            answer = chain.invoke({
                "N": N,
                "image_description": query_text,
                "statement_lines": statement_lines,
            })

            # Normalise a pydantic object to a plain dict (v2 API first, then v1).
            if isinstance(answer, BaseModel):
                if hasattr(answer, "model_dump"):
                    answer = answer.model_dump()
                else:
                    answer = answer.dict()

            raw_items = answer.get("selections", []) if isinstance(answer, dict) else []

            # A dict keeps first-seen order and drops repeats in one structure.
            picked: Dict[str, None] = {}
            for entry in raw_items:
                number = entry.get("i") if isinstance(entry, dict) else None
                if isinstance(number, int) and 1 <= number <= N:
                    picked.setdefault(flat_tags[number - 1][0], None)
            chosen_tags: List[str] = list(picked)

            if log:
                tag_str = ", ".join(chosen_tags) if chosen_tags else "(none)"
                log(f"Stage3s: attempt {attempt_no} selected {len(chosen_tags)} tags: {tag_str}")

            return chosen_tags

        except Exception as e:
            if log:
                log(f"Stage3s: attempt {attempt_no} error: {e}")

    if log:
        log(f"Stage3s: gave up after {retries+1} attempts")
    return []
1179
+
1180
+
1181
+ # ---------------------------------------------------------------------------
1182
+ # Stage 3p: Simplified high-precision probe tags
1183
+ # ---------------------------------------------------------------------------
1184
# Process-wide memo slots for the Stage 3p loaders below. Each starts as None
# ("not loaded yet") and is filled by its loader on first use.
_cached_runtime_probe_tags: Optional[List[str]] = None  # result of _load_runtime_probe_tags
_cached_runtime_probe_rows: Optional[List[Dict[str, str]]] = None  # result of _load_runtime_probe_rows
_cached_runtime_probe_wiki_defs: Optional[Dict[str, str]] = None  # result of _load_runtime_probe_wiki_defs

# Hand-written glossary text, used by _build_probe_candidate_display when a
# probe tag is marked as needing a glossary but tag_wiki_defs.json has no
# usable definition for it.
_PROBE_GLOSSARY_FALLBACKS: Dict[str, str] = {
    "anthro": "Animal character with human-like body shape, usually upright with arms and hands.",
    "canid": "Member of dog-family species (wolves, foxes, dogs, coyotes).",
    "felid": "Member of cat-family species (cats, lions, tigers, leopards).",
    "solo": "Exactly one character is present in the image.",
    "duo": "Exactly two characters are present in the image.",
    "group": "Four or more characters are present in the image.",
    "<3": "Visible heart symbol in text or icon form.",
}
1197
+
1198
+
1199
def _load_runtime_probe_rows(log=None) -> List[Dict[str, str]]:
    """Return every row of data/analysis/simplified_probe_tags.csv, memoized.

    Args:
        log: Optional callable taking one string; told when the CSV is
            missing or cannot be read.

    Returns:
        The CSV rows as dicts keyed by header name. An empty list when the
        file is absent or unreadable. Whatever the first call produced
        (including an empty list) is cached for the life of the process.
    """
    global _cached_runtime_probe_rows
    if _cached_runtime_probe_rows is not None:
        return _cached_runtime_probe_rows

    csv_path = Path(__file__).resolve().parents[2] / "data" / "analysis" / "simplified_probe_tags.csv"
    loaded: List[Dict[str, str]] = []

    if csv_path.is_file():
        try:
            with csv_path.open("r", encoding="utf-8", newline="") as handle:
                loaded = list(csv.DictReader(handle))
        except Exception as e:
            # Best effort: a broken probe file disables the probe step
            # rather than failing the whole request.
            if log:
                log(f"Stage3p: failed reading probe CSV: {e}")
            loaded = []
    elif log:
        log(f"Stage3p: probe CSV not found at {csv_path}; skipping probe step")

    _cached_runtime_probe_rows = loaded
    return loaded
1222
+
1223
+
1224
def _load_runtime_probe_wiki_defs() -> Dict[str, str]:
    """Return the contents of data/tag_wiki_defs.json, memoized.

    A missing file, or any error while reading or decoding it, yields an
    empty dict. The value is stored in ``_cached_runtime_probe_wiki_defs``
    so later calls skip the file access.
    """
    global _cached_runtime_probe_wiki_defs
    if _cached_runtime_probe_wiki_defs is None:
        wiki_path = Path(__file__).resolve().parents[2] / "data" / "tag_wiki_defs.json"
        loaded: Dict[str, str] = {}
        if wiki_path.is_file():
            try:
                with wiki_path.open("r", encoding="utf-8") as handle:
                    loaded = _json.load(handle)
            except Exception:
                # Best effort: glossary text is optional decoration.
                loaded = {}
        _cached_runtime_probe_wiki_defs = loaded
    return _cached_runtime_probe_wiki_defs
1240
+
1241
+
1242
def _load_runtime_probe_tags(log=None) -> List[str]:
    """Load runtime probe tags from analysis output.

    Preference order:
        1) selected_final=1 (reliability-gated list)
        2) selected_initial=1 (fallback if reliability file not built)

    Args:
        log: Optional callable taking one string; passed to the row loader
            and told how many tags were loaded when there are any.

    Returns:
        The non-blank tag names of the preferred selection, in file order.
        The list is cached for the life of the process, even when empty.
    """
    global _cached_runtime_probe_tags
    if _cached_runtime_probe_tags is not None:
        return _cached_runtime_probe_tags

    rows = _load_runtime_probe_rows(log=log)

    def _is_on(v: Optional[str]) -> bool:
        # Case-insensitive so "TRUE" is honoured as well as "true"/"True".
        return (v or "").strip().lower() in {"1", "true"}

    def _tags_where(column: str) -> List[str]:
        # csv.DictReader fills cells missing from a short row with None, so
        # the tag value has to be None-safe before it is stripped.
        return [(r.get("tag") or "").strip() for r in rows if _is_on(r.get(column))]

    final = _tags_where("selected_final")
    initial = _tags_where("selected_initial")
    tags = [t for t in (final if final else initial) if t]

    _cached_runtime_probe_tags = tags
    if log and tags:
        log(f"Stage3p: loaded {len(tags)} probe tags")
    return tags
1267
+
1268
+
1269
+ def _is_real_wiki_def(text: str) -> bool:
1270
+ t = (text or "").strip()
1271
+ if not t:
1272
+ return False
1273
+ if t.lower().startswith("thumb "):
1274
+ return False
1275
+ return len(t) >= 20
1276
+
1277
+
1278
+ def _clean_glossary_text(text: str) -> str:
1279
+ t = " ".join((text or "").replace("\n", " ").replace("\r", " ").split())
1280
+ if len(t) > 160:
1281
+ t = t[:157].rstrip() + "..."
1282
+ return t
1283
+
1284
+
1285
def _build_probe_candidate_display(probe_tags: Sequence[str], log=None) -> Dict[str, str]:
    """Map each probe tag to the text shown for it in the selection prompt.

    A tag whose CSV row does not ask for a glossary is shown as its plain
    display form. Otherwise the display form is followed by " - " and a
    cleaned definition, taken from the wiki definitions when usable and
    from ``_PROBE_GLOSSARY_FALLBACKS`` when not. If neither supplies text,
    the plain display form is used.

    Args:
        probe_tags: Tags to build display strings for.
        log: Optional callable taking one string, forwarded to the row loader.

    Returns:
        Dict from tag to display string, one entry per distinct probe tag.
    """
    rows = _load_runtime_probe_rows(log=log)
    # csv.DictReader fills cells missing from a short row with None, so the
    # tag value has to be None-safe before it is stripped.
    rows_by_tag = {(r.get("tag") or "").strip(): r for r in rows}
    wiki_defs = _load_runtime_probe_wiki_defs()

    display: Dict[str, str] = {}
    for tag in probe_tags:
        base = _display_tag(tag)
        row = rows_by_tag.get(tag, {})
        # Case-insensitive so "TRUE" is honoured as well as "true"/"True".
        needs_glossary = (row.get("needs_glossary") or "").strip().lower() in {"1", "true"}
        if not needs_glossary:
            display[tag] = base
            continue

        raw_def = wiki_defs.get(tag, "")
        if not _is_real_wiki_def(raw_def):
            raw_def = _PROBE_GLOSSARY_FALLBACKS.get(tag, "")
        gloss = _clean_glossary_text(raw_def)
        display[tag] = f"{base} - {gloss}" if gloss else base

    return display
1306
+
1307
+
1308
def llm_infer_probe_tags(
    query_text: str,
    log=None,
    *,
    temperature: float = 0.0,
    max_tokens: int = 512,
    retries: int = 2,
    min_why: Optional[str] = None,
) -> List[str]:
    """Infer probe tags from a fixed reliability-gated tag list.

    The whole probe list is offered to ``llm_select_indices`` in a single
    call and the indices it returns are translated back into tag names.

    Args:
        query_text: Image description to probe.
        log: Optional callable taking one string; receives progress messages.
        temperature: Sampling temperature forwarded to the selector.
        max_tokens: Completion budget forwarded to the selector.
        retries: Retry count forwarded to the selector.
        min_why: Accepted for signature compatibility; the selector is
            always called with ``min_why=None``.

    Returns:
        The selected probe tags in the order the selector returned them, or
        an empty list when no probe tags are configured.
    """
    probe_tags = _load_runtime_probe_tags(log=log)
    if not probe_tags:
        return []

    tag_count = len(probe_tags)
    if log:
        log(f"Stage3p: probing {tag_count} tags")
    candidate_display = _build_probe_candidate_display(probe_tags, log=log)

    # One call sees every probe tag and may select all of them.
    whole_list = max(1, tag_count)
    picked_indices = llm_select_indices(
        query_text=query_text,
        candidates=probe_tags,
        max_pick=tag_count,
        log=log,
        retries=retries,
        mode="single_shot",
        chunk_size=whole_list,
        per_phrase_k=whole_list,
        temperature=temperature,
        max_tokens=max_tokens,
        return_metadata=False,
        return_diagnostics=False,
        min_why=None,
        candidate_display=candidate_display,
    )

    # Ignore any index that does not address a probe tag.
    selected: List[str] = [probe_tags[i] for i in picked_indices if 0 <= i < tag_count]

    if log:
        shown = ", ".join(selected) if selected else "(none)"
        log(f"Stage3p: selected {len(selected)} probe tags: {shown}")
    return selected
psq_rag/retrieval/psq_retrieval.py CHANGED
@@ -146,7 +146,7 @@ def psq_candidates_from_rewrite_phrases(
146
  context_tag_weight: float = 1.0,
147
  context_weight: float = 0.5,
148
  per_phrase_k: int = 50,
149
- per_phrase_final_k: int = 10,
150
  global_k: int = 300,
151
  return_phrase_ranks: bool = False,
152
  verbose: bool = False,
 
146
  context_tag_weight: float = 1.0,
147
  context_weight: float = 0.5,
148
  per_phrase_k: int = 50,
149
+ per_phrase_final_k: int = 1,
150
  global_k: int = 300,
151
  return_phrase_ranks: bool = False,
152
  verbose: bool = False,
psq_rag/ui/group_ranked_display.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import csv
4
+ from dataclasses import dataclass
5
+ from functools import lru_cache
6
+ from pathlib import Path
7
+ from typing import Dict, List, Sequence, Tuple
8
+
9
+ import numpy as np
10
+
11
+ from psq_rag.retrieval.psq_retrieval import construct_pseudo_vector, _norm_tag_for_lookup
12
+ from psq_rag.retrieval.state import get_tfidf_components, get_tfidf_tag_vectors
13
+
14
+
15
@dataclass
class GroupRankingRow:
    """One ranked group as produced by ``rank_groups_from_tfidf``."""

    # Category name the row describes.
    group_name: str
    # Sum of the calibrated probabilities of the group's top-ranked tags.
    expected_count: float
    # (tag, probability) pairs, highest probability first.
    tags: List[Tuple[str, float]]
20
+
21
+
22
+ @lru_cache(maxsize=1)
23
+ def _load_enabled_groups() -> Dict[str, List[str]]:
24
+ csv_path = Path("data/analysis/category_registry.csv")
25
+ groups: Dict[str, List[str]] = {}
26
+ if not csv_path.exists():
27
+ return groups
28
+
29
+ with csv_path.open("r", encoding="utf-8", newline="") as f:
30
+ reader = csv.DictReader(f)
31
+ for row in reader:
32
+ tag = (row.get("tag") or "").strip()
33
+ if not tag:
34
+ continue
35
+
36
+ enabled = str(row.get("category_enabled") or "").strip().lower() in {"1", "true", "yes"}
37
+ if not enabled:
38
+ continue
39
+
40
+ status = (row.get("category_status") or "").strip().lower()
41
+ if status == "excluded":
42
+ continue
43
+
44
+ group_name = (row.get("category_name") or "").strip()
45
+ if not group_name:
46
+ continue
47
+
48
+ groups.setdefault(group_name, []).append(_norm_tag_for_lookup(tag))
49
+
50
+ # Deduplicate per group, preserving order.
51
+ deduped: Dict[str, List[str]] = {}
52
+ for name, tags in groups.items():
53
+ seen = set()
54
+ out = []
55
+ for t in tags:
56
+ if t in seen:
57
+ continue
58
+ seen.add(t)
59
+ out.append(t)
60
+ if out:
61
+ deduped[name] = out
62
+ return deduped
63
+
64
+
65
+ def _calibrate_probabilities(scores: Dict[str, float]) -> Dict[str, float]:
66
+ if not scores:
67
+ return {}
68
+ vals = np.asarray(list(scores.values()), dtype=np.float32)
69
+ center = float(np.median(vals))
70
+ q25 = float(np.percentile(vals, 25))
71
+ q75 = float(np.percentile(vals, 75))
72
+ scale = q75 - q25
73
+ if scale <= 1e-6:
74
+ scale = float(np.std(vals))
75
+ if scale <= 1e-6:
76
+ scale = 1.0
77
+
78
+ probs: Dict[str, float] = {}
79
+ for tag, score in scores.items():
80
+ z = (float(score) - center) / scale
81
+ p = 1.0 / (1.0 + float(np.exp(-z)))
82
+ probs[tag] = p
83
+ return probs
84
+
85
+
86
def rank_groups_from_tfidf(
    seed_terms: Sequence[str],
    *,
    top_groups: int,
    top_tags_per_group: int,
    group_rank_top_k: int,
) -> List[GroupRankingRow]:
    """Rank the enabled tag groups by similarity to the seed terms.

    The seed terms known to the TF-IDF vocabulary form a pseudo-document,
    which is projected through the SVD model and compared (dot product of
    normalized vectors) against every group tag that has a reduced vector.
    The similarities are calibrated to probabilities, and each group is
    scored by the sum of its ``group_rank_top_k`` highest probabilities.

    Args:
        seed_terms: Terms to seed the query with; each is normalized before
            lookup and unknown terms are ignored.
        top_groups: Maximum number of groups to return (at least 1).
        top_tags_per_group: Tags to keep per returned group (at least 1).
        group_rank_top_k: Number of top tags summed into a group's score
            (at least 1).

    Returns:
        Groups ordered by descending expected count. Empty when there are no
        enabled groups, no seed term is in the vocabulary, the projected
        query has zero norm, or no group tag has a vector.
    """
    groups = _load_enabled_groups()
    if not groups:
        return []

    components = get_tfidf_components()
    tag_vectors = get_tfidf_tag_vectors()
    idf = components["idf"]
    term_to_col = components["tag_to_column_index"]
    svd = components["svd_model"]
    tag_to_row = tag_vectors["tag_to_row_index"]
    mat_norm = tag_vectors["reduced_matrix_norm"]

    # Term-frequency pseudo-document over the seed terms the model knows.
    pseudo_doc: Dict[str, float] = {}
    for raw_term in seed_terms:
        term = _norm_tag_for_lookup(str(raw_term))
        if term not in term_to_col:
            continue
        pseudo_doc[term] = pseudo_doc.get(term, 0.0) + 1.0
    if not pseudo_doc:
        return []

    pseudo_vec = construct_pseudo_vector(pseudo_doc, idf, term_to_col)
    query = svd.transform(pseudo_vec).reshape(-1).astype(np.float32)
    query_norm = float(np.linalg.norm(query))
    if query_norm <= 0.0:
        return []
    query = query / query_norm

    # Distinct tags across all groups, first-seen order, that have a vector.
    distinct_tags = list(dict.fromkeys(t for tags in groups.values() for t in tags))
    located = [(tag, tag_to_row.get(tag)) for tag in distinct_tags]
    scored_tags: List[str] = [tag for tag, row in located if row is not None]
    row_ids: List[int] = [int(row) for _, row in located if row is not None]
    if not row_ids:
        return []

    sims = (mat_norm[np.asarray(row_ids, dtype=np.int32)] @ query).astype(np.float32)
    score_by_tag: Dict[str, float] = {t: float(s) for t, s in zip(scored_tags, sims)}
    prob_by_tag = _calibrate_probabilities(score_by_tag)

    rank_k = max(1, int(group_rank_top_k))
    display_k = max(1, int(top_tags_per_group))

    ranked: List[GroupRankingRow] = []
    for group_name, tags in groups.items():
        by_prob = sorted(
            ((t, prob_by_tag[t]) for t in tags if t in prob_by_tag),
            key=lambda pair: pair[1],
            reverse=True,
        )
        if not by_prob:
            continue
        ranked.append(
            GroupRankingRow(
                group_name=group_name,
                expected_count=float(sum(p for _, p in by_prob[:rank_k])),
                tags=by_prob[:display_k],
            )
        )

    ranked.sort(key=lambda r: r.expected_count, reverse=True)
    return ranked[: max(1, int(top_groups))]
160
+
161
+
162
+ def _fmt_tag_cell(tag: str, p: float) -> str:
163
+ safe_tag = tag.replace("|", "\\|")
164
+ return f"`{safe_tag}` (p={p:.2f}, E={p:.2f})"
165
+
166
+
167
def render_group_rankings_markdown(
    seed_terms: Sequence[str],
    *,
    top_groups: int,
    top_tags_per_group: int,
    group_rank_top_k: int,
) -> str:
    """Render the group ranking for ``seed_terms`` as a Markdown table.

    Args:
        seed_terms: Terms forwarded to ``rank_groups_from_tfidf``.
        top_groups: Maximum number of group rows, forwarded unchanged.
        top_tags_per_group: Number of ``Tag i`` columns (clamped to >= 1).
        group_rank_top_k: Value shown in the "Expected Tags (top N)" header
            (clamped to >= 1) and forwarded to the ranking.

    Returns:
        A Markdown table with one row per ranked group, or a one-line
        fallback message when the ranking yields no rows.
    """
    rows = rank_groups_from_tfidf(
        seed_terms,
        top_groups=top_groups,
        top_tags_per_group=top_tags_per_group,
        group_rank_top_k=group_rank_top_k,
    )
    if not rows:
        return "No ranked group display available (insufficient TF-IDF context)."

    k = max(1, int(top_tags_per_group))
    rank_k = max(1, int(group_rank_top_k))
    headers = ["Group/Category", f"Expected Tags (top {rank_k})"]
    headers.extend(f"Tag {i}" for i in range(1, k + 1))
    table = [
        "| " + " | ".join(headers) + " |",
        "| " + " | ".join(["---"] * len(headers)) + " |",
    ]

    for row in rows:
        # Escape pipes in the group name, as is already done for tags, so a
        # name containing "|" cannot split the row into extra columns.
        group_cell = row.group_name.replace("|", "\\|")
        cells = [group_cell, f"{row.expected_count:.2f}"]
        tag_cells = [_fmt_tag_cell(tag, p) for tag, p in row.tags]
        # Pad short rows to the header width; a non-positive count adds nothing.
        tag_cells.extend([""] * (k - len(tag_cells)))
        cells.extend(tag_cells)
        table.append("| " + " | ".join(cells) + " |")
    return "\n".join(table)
scripts/eval_pipeline.py CHANGED
@@ -582,7 +582,7 @@ def run_eval(
582
  mode: str = "chunked_map_union",
583
  chunk_size: int = 60,
584
  per_phrase_k: int = 2,
585
- per_phrase_final_k: int = 10,
586
  temperature: float = 0.0,
587
  max_tokens: int = 512,
588
  verbose: bool = False,
@@ -982,7 +982,7 @@ def main(argv=None) -> int:
982
  choices=["single_shot", "chunked_map_union"])
983
  ap.add_argument("--chunk-size", type=int, default=60)
984
  ap.add_argument("--per-phrase-k", type=int, default=2)
985
- ap.add_argument("--per-phrase-final-k", type=int, default=10,
986
  help="Top-K candidates per phrase after scoring (retrieval cap)")
987
  ap.add_argument("--temperature", type=float, default=0.0)
988
  ap.add_argument("--max-tokens", type=int, default=512)
 
582
  mode: str = "chunked_map_union",
583
  chunk_size: int = 60,
584
  per_phrase_k: int = 2,
585
+ per_phrase_final_k: int = 1,
586
  temperature: float = 0.0,
587
  max_tokens: int = 512,
588
  verbose: bool = False,
 
982
  choices=["single_shot", "chunked_map_union"])
983
  ap.add_argument("--chunk-size", type=int, default=60)
984
  ap.add_argument("--per-phrase-k", type=int, default=2)
985
+ ap.add_argument("--per-phrase-final-k", type=int, default=1,
986
  help="Top-K candidates per phrase after scoring (retrieval cap)")
987
  ap.add_argument("--temperature", type=float, default=0.0)
988
  ap.add_argument("--max-tokens", type=int, default=512)