Spaces:

FoodDesert
/

Prompt_Squirrel_RAG

Running

Claude committed on Feb 13

Commit

684cf99

1 Parent(s): c14936f

Redesign structural inference as group-based system with wiki data

- Organize structural tags into semantic groups (character count, body type,
gender, clothing state, visual elements) with explicit constraints
- Load definitions from tag_wiki_defs.json where text exists, fall back to
curated definitions for thumbnail-only wiki entries
- Add clothing state group (clothed/nude/topless/bottomless) and visual
elements group (looking_at_viewer/text) to address top misses
- Improve anthro vs humanoid distinction with clearer definitions and example
- Add taur to body type group
- Fix extract_wiki_data.py: filter "top" navigation artifacts, skip
thumbnail-only definitions, deduplicate group members
- Update analyze_compact_eval.py structural tag set for new groups

https://claude.ai/code/session_019PY5TEXTWGtToUbowunSRG

Files changed (3) hide show

psq_rag/llm/select.py +195 -46
scripts/analyze_compact_eval.py +12 -1
scripts/extract_wiki_data.py +23 -3

psq_rag/llm/select.py CHANGED Viewed

@@ -12,6 +12,7 @@
 import os
 import re
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union, cast, Literal
 from langchain_openai import ChatOpenAI
@@ -763,47 +764,184 @@ def llm_select_indices(
 # ---------------------------------------------------------------------------
-# Stage 3s: Structural tag inference (solo/duo/male/female/anthro/biped …)
 # ---------------------------------------------------------------------------
-# Each statement maps to exactly one tag.  The LLM picks statement numbers.
-_STRUCTURAL_STATEMENTS: List[Tuple[str, str]] = [
-    # Character count — exactly one should be picked
-    ("No characters or living beings appear in the image", "zero_pictured"),
-    ("There is exactly one character in the image", "solo"),
-    ("There are exactly two characters in the image", "duo"),
-    ("There are exactly three characters in the image", "trio"),
-    ("There are four or more characters in the image", "group"),
-    # Body plan — pick all that apply across characters
-    ("A character is a normal animal walking on all fours, not humanized", "feral"),
-    ("A character is an animal with a human-like body (standing upright on two legs, with hands)", "anthro"),
-    ("A character is a human or looks fully human", "humanoid"),
-    # Gender — pick all that apply across characters
-    ("A male character is shown", "male"),
-    ("A female character is shown", "female"),
-    ("A character's gender cannot be determined from the description", "ambiguous_gender"),
-    ("An intersex or hermaphrodite character is shown", "intersex"),
-]
-STRUCTURAL_SYSTEM_TEMPLATE = """You classify image descriptions. You will read a description of an image, then select which numbered statements are true about it.
 IMPORTANT RULES:
 1. ONLY select a statement if the description directly says it or makes it very obvious.
-2. Do NOT guess or assume anything the description does not say.
-3. Select exactly ONE statement from the character count group (statements about how many characters there are).
-4. Select ALL statements that apply from the body type and gender groups.
-5. If the description does not mention gender at all, select the "gender cannot be determined" statement.
-Return JSON matching this exact format — nothing else:
 {{"selections": [{{"i": 1}}, {{"i": 5}}]}}
-where each "i" is a statement number from 1 to {N}.
 EXAMPLE:
-Description: "A muscular male wolf standing in a forest, giving a thumbs up"
-Statements: 1. No characters  2. Exactly one character  3. Exactly two  4. Exactly three  5. Four or more  6. Normal animal on all fours  7. Animal with human-like body  8. Human  9. Male shown  10. Female shown  11. Gender unknown  12. Intersex shown
-Correct answer: {{"selections": [{{"i": 2}}, {{"i": 7}}, {{"i": 9}}]}}
-Reasoning: One character (2), wolf standing upright with hands giving thumbs up = animal with human body (7), described as male (9)."""
 STRUCTURAL_USER_TEMPLATE = """Read this image description and select which statements are true.
@@ -851,29 +989,39 @@ def _build_structural_response_format() -> Dict[str, Any]:
     }
 def llm_infer_structural_tags(
     query_text: str,
     log=None,
     *,
     temperature: float = 0.0,
-    max_tokens: int = 256,
     retries: int = 2,
 ) -> List[str]:
-    """Infer structural tags (solo/duo/male/female/anthro/biped/…) via LLM.
-    Instead of retrieving these from a candidate list, we ask the LLM to agree
-    with natural-language statements about the image.  This handles tags that
-    are almost never stated in captions but are visually/structurally obvious.
-    Returns a list of e621 tag strings (e.g. ["solo", "anthro", "male", "biped"]).
     """
     if log:
-        log("Stage3s (structural): inferring structural tags via statement agreement")
-    statements = _STRUCTURAL_STATEMENTS
-    lines = [f"{j}. {stmt}" for j, (stmt, _tag) in enumerate(statements, 1)]
-    statement_lines = "\n".join(lines)
-    N = len(statements)
     response_format = _build_structural_response_format()
     llm = _get_llm(temperature=temperature, max_tokens=max_tokens,
@@ -892,7 +1040,8 @@ def llm_infer_structural_tags(
     chain = prompt | llm | parser
     if log:
-        log(f"Stage3s: model={model_name} statements={N}")
     for att in range(retries + 1):
         try:
@@ -907,12 +1056,12 @@ def llm_infer_structural_tags(
             sels = parsed.get("selections", []) if isinstance(parsed, dict) else []
             chosen_tags: List[str] = []
-            seen = set()
             for item in sels:
                 idx = item.get("i") if isinstance(item, dict) else None
                 if not isinstance(idx, int) or idx < 1 or idx > N:
                     continue
-                tag = statements[idx - 1][1]
                 if tag not in seen:
                     chosen_tags.append(tag)
                     seen.add(tag)

 import os
 import re
 from dataclasses import dataclass
+from pathlib import Path
 from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union, cast, Literal
 from langchain_openai import ChatOpenAI
 # ---------------------------------------------------------------------------
+# Stage 3s: Structural tag inference (solo/duo/male/female/anthro/… )
 # ---------------------------------------------------------------------------
+# Group-based approach: tags are organized into semantic groups. Group
+# membership is curated in code (tag_groups.json is not read by the loader
+# below); definitions come from tag_wiki_defs.json where real text exists,
+# with curated fallback definitions for tags whose wiki entries are only
+# thumbnail references.
+#
+# Each group specifies a constraint mode:
+#   "exclusive" = pick exactly one (e.g. character count)
+#   "multi"     = pick all that apply (e.g. body type, gender)
+import json as _json
@dataclass
class StructuralGroup:
    """One category of structural tags to probe.

    A group bundles related tags (for example all character-count tags)
    together with the rule for how many of them may be selected.

    Raises:
        ValueError: If ``constraint`` is neither ``"exclusive"`` nor
            ``"multi"``.
    """

    # Machine-friendly group name such as "character_count"; the prompt
    # builder upper-cases it and swaps underscores for spaces in the header.
    name: str
    # "exclusive" = pick exactly one tag, "multi" = pick all that apply.
    constraint: Literal["exclusive", "multi"]
    # (tag, definition) pairs in the order they are numbered in the prompt.
    tags: List[Tuple[str, str]]

    def __post_init__(self) -> None:
        # The prompt builder treats every value other than "exclusive" as
        # "multi", so a typo would silently change the picking rule.
        # Fail fast instead.
        if self.constraint not in ("exclusive", "multi"):
            raise ValueError(
                f"StructuralGroup {self.name!r}: constraint must be "
                f"'exclusive' or 'multi', got {self.constraint!r}"
            )
def _load_structural_groups(
    data_dir: Optional[Union[str, Path]] = None,
) -> List[StructuralGroup]:
    """Build the structural tag groups, preferring wiki text for definitions.

    Group membership, ordering and constraint modes are curated in this
    function. For each tag the definition is taken from
    ``tag_wiki_defs.json`` when that file holds real text for the tag;
    otherwise the curated fallback sentence listed here is used.

    Args:
        data_dir: Directory containing ``tag_wiki_defs.json``. Defaults to
            the ``data`` directory two levels above this module's directory.

    Returns:
        The groups in prompt order: character count, body type, gender,
        clothing state, visual elements.

    Raises:
        ValueError: If ``tag_wiki_defs.json`` exists but its top-level JSON
            value is not an object.
    """
    if data_dir is None:
        data_dir = Path(__file__).resolve().parents[2] / "data"

    # Wiki definitions are optional: a missing file just means every tag
    # uses its curated fallback.
    wiki_defs: Dict[str, Any] = {}
    wiki_path = Path(data_dir) / "tag_wiki_defs.json"
    if wiki_path.is_file():
        with wiki_path.open("r", encoding="utf-8") as f:
            loaded = _json.load(f)
        if not isinstance(loaded, dict):
            raise ValueError(
                f"{wiki_path}: expected a JSON object mapping tag -> definition, "
                f"got {type(loaded).__name__}"
            )
        wiki_defs = loaded

    def _def(tag: str, fallback: str) -> str:
        """Return the wiki definition if it is usable text, else ``fallback``."""
        raw = wiki_defs.get(tag)
        if not isinstance(raw, str):
            # Missing entry, null, or a nested structure: nothing usable.
            return fallback
        # Collapse all whitespace so the definition stays on a single
        # numbered prompt line and padding cannot defeat the checks below.
        text = " ".join(raw.split())
        # Too short to be a real sentence, or a thumbnail-only entry.
        if len(text) < 15 or text.startswith("thumb "):
            return fallback
        return text[:200].rstrip()  # cap length for prompt

    # NOTE: the order of groups and of tags inside each group fixes the
    # statement numbers shown to the LLM. The worked example in
    # STRUCTURAL_SYSTEM_TEMPLATE cites statements 2, 6, 10 and 14
    # (solo, anthro, male, clothed), so reordering here requires updating it.
    curated: Tuple[Tuple[str, str, Tuple[Tuple[str, str], ...]], ...] = (
        # Group A: Character Count (exclusive)
        ("character_count", "exclusive", (
            ("zero_pictured",
                "No characters or living beings appear in the image"),
            ("solo",
                "Exactly one character appears in the image"),
            ("duo",
                "Exactly two characters appear in the image"),
            ("trio",
                "Exactly three characters appear in the image"),
            ("group",
                "Four or more characters appear in the image"),
        )),
        # Group B: Body Type (multi, per character)
        # Key distinction the LLM must learn:
        #   anthro = ANIMAL with human body shape (upright, hands)
        #   humanoid = HUMAN or near-human (elf, dwarf) with NO animal features
        #   feral = normal animal shape, on all fours
        ("body_type", "multi", (
            ("anthro",
                "An animal character with a human-like body: walks upright on two legs, "
                "has arms and hands. Examples: a wolf-person, a fox standing up. "
                "Still has animal features like fur, tail, muzzle"),
            ("feral",
                "A regular animal in its natural body shape. Walks on all fours (or "
                "flies/swims naturally). NOT standing upright, NOT humanized"),
            ("humanoid",
                "A human or human-like character with NO animal features. Includes "
                "humans, elves, dwarves, and fantasy races that look human. "
                "Does NOT include animal-people — those are anthro"),
            ("taur",
                "A centaur-like body: human or anthro upper body attached to a "
                "four-legged animal lower body"),
        )),
        # Group C: Gender (multi, per character)
        ("gender", "multi", (
            ("male",
                "A character described as male, a boy, or with he/him pronouns"),
            ("female",
                "A character described as female, a girl, or with she/her pronouns"),
            ("ambiguous_gender",
                "A character whose gender is not stated or cannot be determined"),
            ("intersex",
                "A character explicitly described as intersex or hermaphrodite"),
        )),
        # Group D: Clothing State (multi)
        ("clothing_state", "multi", (
            ("clothed",
                "A character is wearing clothes on both upper and lower body"),
            ("nude",
                "A character is wearing no clothes at all"),
            ("topless",
                "A character's upper body is uncovered but lower body has clothing"),
            ("bottomless",
                "A character wears clothing on upper body but lower body is uncovered"),
        )),
        # Group E: Common Visual Elements (multi)
        ("visual_elements", "multi", (
            ("looking_at_viewer",
                "A character is looking directly at the camera or viewer"),
            ("text",
                "The image contains visible writing, words, or lettering"),
        )),
    )

    return [
        StructuralGroup(
            name=group_name,
            constraint=constraint,
            tags=[(tag, _def(tag, fallback)) for tag, fallback in members],
        )
        for group_name, constraint, members in curated
    ]
def _build_structural_prompt(groups: List[StructuralGroup]) -> Tuple[str, List[Tuple[str, str]]]:
    """Render the groups as a single numbered statement list.

    Each group contributes a header line (its name plus the picking rule),
    one numbered line per definition, and a trailing empty line. Numbering
    is 1-based and continues across groups.

    Returns ``(text, pairs)`` where ``pairs[k - 1]`` is the
    ``(tag, definition)`` pair behind statement number ``k``.
    """
    numbered: List[Tuple[str, str]] = []
    rendered: List[str] = []
    for group in groups:
        # Anything other than "exclusive" is presented as a multi-pick group.
        if group.constraint == "exclusive":
            rule = "pick EXACTLY ONE"
        else:
            rule = "pick ALL that apply"
        title = group.name.replace("_", " ").upper()
        rendered.append(f"--- {title} ({rule}) ---")
        for tag, definition in group.tags:
            numbered.append((tag, definition))
            # The statement number is the pair's 1-based position in the flat list.
            rendered.append(f"{len(numbered)}. {definition}")
        rendered.append("")  # separator line after every group
    return "\n".join(rendered), numbered
+STRUCTURAL_SYSTEM_TEMPLATE = """You classify image descriptions by selecting true statements from a numbered list.
+The statements are organized into GROUPS. Each group header tells you how many to pick:
+- "pick EXACTLY ONE" = choose the single best match in that group
+- "pick ALL that apply" = choose every statement that is true
 IMPORTANT RULES:
 1. ONLY select a statement if the description directly says it or makes it very obvious.
+2. Do NOT guess or assume things the description does not mention.
+3. For body type: "anthro" means an ANIMAL with a human-shaped body (walks upright, has hands, but still has fur/tail/muzzle). "humanoid" means HUMAN or human-like with NO animal features. A wolf standing on two legs = anthro, NOT humanoid.
+4. If the description never mentions gender, pick "gender cannot be determined".
+5. If clothing is not mentioned, do NOT pick any clothing statement.
+Return JSON ONLY:
 {{"selections": [{{"i": 1}}, {{"i": 5}}]}}
 EXAMPLE:
+Description: "A muscular male wolf standing in a forest, wearing jeans, giving a thumbs up"
+Answer: {{"selections": [{{"i": 2}}, {{"i": 6}}, {{"i": 10}}, {{"i": 14}}]}}
+Why: One character = solo (2). Wolf standing upright with hands = anthro (6), NOT humanoid because it is a wolf. Male (10). Wearing jeans = clothed (14)."""
 STRUCTURAL_USER_TEMPLATE = """Read this image description and select which statements are true.
     }
+# Cache the loaded groups so we only read JSON files once per process.
+_cached_structural_groups: Optional[List[StructuralGroup]] = None
def _get_structural_groups() -> List[StructuralGroup]:
    """Return the structural groups, loading them on first use.

    The result of ``_load_structural_groups()`` is stored in the module-level
    ``_cached_structural_groups`` and the same list object is returned on
    every later call.
    """
    global _cached_structural_groups
    if _cached_structural_groups is not None:
        return _cached_structural_groups
    _cached_structural_groups = _load_structural_groups()
    return _cached_structural_groups
 def llm_infer_structural_tags(
     query_text: str,
     log=None,
     *,
     temperature: float = 0.0,
+    max_tokens: int = 512,
     retries: int = 2,
 ) -> List[str]:
+    """Infer structural tags via LLM using group-based statement agreement.
+    Probes multiple semantic groups (character count, body type, gender,
+    clothing state, visual elements) with definitions loaded from wiki data
+    where available.
+    Returns a list of e621 tag strings (e.g. ["solo", "anthro", "male", "clothed"]).
     """
     if log:
+        log("Stage3s (structural): inferring structural tags via group-based statement agreement")
+    groups = _get_structural_groups()
+    statement_lines, flat_tags = _build_structural_prompt(groups)
+    N = len(flat_tags)
     response_format = _build_structural_response_format()
     llm = _get_llm(temperature=temperature, max_tokens=max_tokens,
     chain = prompt | llm | parser
     if log:
+        group_summary = ", ".join(f"{g.name}({len(g.tags)})" for g in groups)
+        log(f"Stage3s: model={model_name} groups=[{group_summary}] total_statements={N}")
     for att in range(retries + 1):
         try:
             sels = parsed.get("selections", []) if isinstance(parsed, dict) else []
             chosen_tags: List[str] = []
+            seen: Set[str] = set()
             for item in sels:
                 idx = item.get("i") if isinstance(item, dict) else None
                 if not isinstance(idx, int) or idx < 1 or idx > N:
                     continue
+                tag = flat_tags[idx - 1][0]
                 if tag not in seen:
                     chosen_tags.append(tag)
                     seen.add(tag)

scripts/analyze_compact_eval.py CHANGED Viewed

@@ -53,7 +53,18 @@ _TAXONOMY = frozenset({"mammal","canid","canine","canis","felid","feline","felis
 _BODY_PLAN = frozenset({"anthro","feral","biped","quadruped","taur","humanoid","semi-anthro","animatronic","robot","machine","plushie","kemono"})
 _POSE = frozenset({"solo","duo","group","trio","standing","sitting","lying","running","walking","flying","swimming","crouching","kneeling","jumping","looking_at_viewer","looking_away","looking_back","looking_up","looking_down","looking_aside","front_view","side_view","back_view","three-quarter_view","from_above","from_below","close-up","portrait","full-length_portrait","hand_on_hip","arms_crossed","all_fours","on_back","on_side","crossed_arms"})
 _COUNT_RE = re.compile(r"^\d+_(fingers|toes|horns|arms|legs|eyes|ears|wings|tails)")
-_STRUCTURAL = frozenset({"solo","duo","trio","group","zero_pictured","anthro","feral","humanoid","biped","quadruped","male","female","ambiguous_gender","intersex"})
 def categorize(tag, tag_type):
     tid = tag_type.get(tag, -1)

 _BODY_PLAN = frozenset({"anthro","feral","biped","quadruped","taur","humanoid","semi-anthro","animatronic","robot","machine","plushie","kemono"})
 _POSE = frozenset({"solo","duo","group","trio","standing","sitting","lying","running","walking","flying","swimming","crouching","kneeling","jumping","looking_at_viewer","looking_away","looking_back","looking_up","looking_down","looking_aside","front_view","side_view","back_view","three-quarter_view","from_above","from_below","close-up","portrait","full-length_portrait","hand_on_hip","arms_crossed","all_fours","on_back","on_side","crossed_arms"})
 _COUNT_RE = re.compile(r"^\d+_(fingers|toes|horns|arms|legs|eyes|ears|wings|tails)")
# Structural tags, listed by group: character count, body type, gender,
# clothing state, visual elements.
_STRUCTURAL = frozenset((
    # character count
    "solo",
    "duo",
    "trio",
    "group",
    "zero_pictured",
    # body type
    "anthro",
    "feral",
    "humanoid",
    "taur",
    # gender
    "male",
    "female",
    "ambiguous_gender",
    "intersex",
    # clothing state
    "clothed",
    "nude",
    "topless",
    "bottomless",
    # visual elements
    "looking_at_viewer",
    "text",
))
 def categorize(tag, tag_type):
     tid = tag_type.get(tag, -1)

scripts/extract_wiki_data.py CHANGED Viewed

@@ -24,16 +24,27 @@ def _extract_tag_links(body: str) -> List[str]:
     - * [[tagname|display]] — list items
     """
     tags = []
     # Anchor links: [[#tag_name|display_text]]
     for m in re.finditer(r'\[\[#([a-z0-9_]+)\|', body):
-        tags.append(m.group(1))
     # If no anchor links found, try regular wiki links in list items
     if not tags:
         for m in re.finditer(r'\*\s*\[\[([a-z0-9_()]+?)(?:\||\]\])', body):
             tag = m.group(1)
-            if not tag.startswith('tag_group:') and not tag.startswith('tag '):
                 tags.append(tag)
-    return tags
 def _first_sentence(body: str) -> str:
@@ -54,6 +65,15 @@ def _first_sentence(body: str) -> str:
             continue
         if len(line) < 10:
             continue
         # Truncate at first period if it's a real sentence
         period = line.find('. ')
         if period > 20:

     - * [[tagname|display]] — list items
     """
     tags = []
+    # Navigation/heading anchors to skip
+    _SKIP = {"top", "see_also", "related", "back", "contents", "toc"}
     # Anchor links: [[#tag_name|display_text]]
     for m in re.finditer(r'\[\[#([a-z0-9_]+)\|', body):
+        tag = m.group(1)
+        if tag not in _SKIP:
+            tags.append(tag)
     # If no anchor links found, try regular wiki links in list items
     if not tags:
         for m in re.finditer(r'\*\s*\[\[([a-z0-9_()]+?)(?:\||\]\])', body):
             tag = m.group(1)
+            if tag not in _SKIP and not tag.startswith('tag_group:') and not tag.startswith('tag '):
                 tags.append(tag)
+    # Deduplicate while preserving order
+    seen = set()
+    deduped = []
+    for t in tags:
+        if t not in seen:
+            seen.add(t)
+            deduped.append(t)
+    return deduped
 def _first_sentence(body: str) -> str:
             continue
         if len(line) < 10:
             continue
+        # Skip lines that are just thumbnail references (e.g. "thumb #12345 thumb #67890")
+        if re.fullmatch(r'(thumb\s*#\d+\s*)+', line):
+            continue
+        # Skip lines that are mostly thumbnail references with little text
+        thumb_stripped = re.sub(r'thumb\s*#\d+', '', line).strip()
+        if len(thumb_stripped) < 10:
+            continue
+        # Use the thumb-stripped version for the definition
+        line = thumb_stripped
         # Truncate at first period if it's a real sentence
         period = line.find('. ')
         if period > 20: