Spaces:

FoodDesert
/

Prompt_Squirrel_RAG

Running

Claude committed on Feb 10

Commit

09a248d

1 Parent(s): 962e2b4

Add --min-why threshold to filter Stage 3 selections by confidence level

select.py: WHY_RANK ordinal mapping, min_why parameter filters the best{}
dict before output. E.g. min_why="explicit" keeps only explicitly matched
tags, min_why="strong_implied" keeps explicit + strong_implied.

eval_pipeline.py: --min-why CLI arg threaded through to llm_select_indices.

https://claude.ai/code/session_019PY5TEXTWGtToUbowunSRG

Files changed (2) hide show

psq_rag/llm/select.py +23 -0
scripts/eval_pipeline.py +9 -1

psq_rag/llm/select.py CHANGED Viewed

@@ -39,6 +39,15 @@ _GENERIC_CHARACTER_TAGS = frozenset({
 WHY_ENUM = ["explicit", "strong_implied", "weak_implied", "style_or_meta", "other"]
 # Deterministic mapping: ordinal "why" -> numeric score for ordering/debug.
 WHY_TO_SCORE: Dict[str, float] = {
     "explicit": 0.90,
@@ -484,9 +493,14 @@ def llm_select_indices(
     temperature: float = 0.0,
     max_tokens: int = 512,
     return_metadata: bool = False,
 ) -> Union[List[int], Tuple[List[int], Dict[str, str]]]:
     """Return indices into the ORIGINAL candidates list (legacy interface).
     This implementation uses LangChain ONLY.
     NOTE: query_text is treated as the image description (original prompt).
@@ -716,6 +730,15 @@ def llm_select_indices(
                         ENTITY_SYSTEM_TEMPLATE
                     )
     # Deterministic ordering: derived score desc, tie-break by count desc (count not shown to LLM).
     count_by_tag = {c.tag: (c.count if c.count is not None else -1) for c in norm}
     ordered_tags = sorted(best.keys(), key=lambda t: (best[t][0], count_by_tag.get(t, -1)), reverse=True)

 WHY_ENUM = ["explicit", "strong_implied", "weak_implied", "style_or_meta", "other"]
+# Ordinal rank: lower = more confident.  Used for threshold filtering.
+WHY_RANK: Dict[str, int] = {
+    "explicit": 0,
+    "strong_implied": 1,
+    "weak_implied": 2,
+    "style_or_meta": 3,
+    "other": 4,
+}
 # Deterministic mapping: ordinal "why" -> numeric score for ordering/debug.
 WHY_TO_SCORE: Dict[str, float] = {
     "explicit": 0.90,
     temperature: float = 0.0,
     max_tokens: int = 512,
     return_metadata: bool = False,
+    min_why: Optional[str] = None,
 ) -> Union[List[int], Tuple[List[int], Dict[str, str]]]:
     """Return indices into the ORIGINAL candidates list (legacy interface).
+    min_why: if set, only keep tags whose 'why' is at or above this confidence
+             level.  E.g. min_why="explicit" keeps only explicit matches;
+             min_why="strong_implied" keeps explicit + strong_implied.
     This implementation uses LangChain ONLY.
     NOTE: query_text is treated as the image description (original prompt).
                         ENTITY_SYSTEM_TEMPLATE
                     )
+    # Apply why threshold: drop tags below the minimum confidence level.
+    if min_why is not None:
+        max_rank = WHY_RANK.get(min_why, 4)
+        before = len(best)
+        best = {t: v for t, v in best.items() if WHY_RANK.get(v[1], 4) <= max_rank}
+        if log:
+            log(f"Stage3 why filter: min_why={min_why} (rank<={max_rank}), "
+                f"before={before} after={len(best)} dropped={before - len(best)}")
     # Deterministic ordering: derived score desc, tie-break by count desc (count not shown to LLM).
     count_by_tag = {c.tag: (c.count if c.count is not None else -1) for c in norm}
     ordered_tags = sorted(best.keys(), key=lambda t: (best[t][0], count_by_tag.get(t, -1)), reverse=True)

scripts/eval_pipeline.py CHANGED Viewed

@@ -170,6 +170,7 @@ def _process_one_sample(
     max_tokens: int,
     verbose: bool,
     print_lock: threading.Lock,
 ) -> SampleResult:
     """Process a single eval sample through the full pipeline. Thread-safe."""
     from psq_rag.llm.rewrite import llm_rewrite_prompt
@@ -250,6 +251,7 @@ def _process_one_sample(
             temperature=temperature,
             max_tokens=max_tokens,
             return_metadata=True,
         )
         result.stage3_time = time.time() - t0
@@ -351,6 +353,7 @@ def run_eval(
     shuffle: bool = True,
     seed: int = 42,
     workers: int = 1,
 ) -> List[SampleResult]:
     # Load eval samples
@@ -400,7 +403,7 @@ def run_eval(
                 sample, i, total,
                 skip_rewrite, allow_nsfw, mode, chunk_size,
                 per_phrase_k, temperature, max_tokens, verbose,
-                print_lock,
             )
             results.append(result)
     else:
@@ -647,6 +650,9 @@ def main(argv=None) -> int:
                     help="Random seed for shuffle (default: 42)")
     ap.add_argument("--workers", "-w", type=int, default=4,
                     help="Number of parallel workers (default: 4, use 1 for sequential)")
     args = ap.parse_args(list(argv) if argv is not None else None)
@@ -664,6 +670,7 @@ def main(argv=None) -> int:
         shuffle=args.shuffle,
         seed=args.seed,
         workers=args.workers,
     )
     print_summary(results)
@@ -694,6 +701,7 @@ def main(argv=None) -> int:
         "shuffle": args.shuffle,
         "seed": args.seed,
         "workers": args.workers,
         "n_errors": sum(1 for r in results if r.error),
     }

     max_tokens: int,
     verbose: bool,
     print_lock: threading.Lock,
+    min_why: Optional[str] = None,
 ) -> SampleResult:
     """Process a single eval sample through the full pipeline. Thread-safe."""
     from psq_rag.llm.rewrite import llm_rewrite_prompt
             temperature=temperature,
             max_tokens=max_tokens,
             return_metadata=True,
+            min_why=min_why,
         )
         result.stage3_time = time.time() - t0
     shuffle: bool = True,
     seed: int = 42,
     workers: int = 1,
+    min_why: Optional[str] = None,
 ) -> List[SampleResult]:
     # Load eval samples
                 sample, i, total,
                 skip_rewrite, allow_nsfw, mode, chunk_size,
                 per_phrase_k, temperature, max_tokens, verbose,
+                print_lock, min_why,
             )
             results.append(result)
     else:
                     help="Random seed for shuffle (default: 42)")
     ap.add_argument("--workers", "-w", type=int, default=4,
                     help="Number of parallel workers (default: 4, use 1 for sequential)")
+    ap.add_argument("--min-why", default=None,
+                    choices=["explicit", "strong_implied", "weak_implied", "style_or_meta", "other"],
+                    help="Minimum 'why' confidence to keep (e.g. 'explicit' keeps only explicit matches)")
     args = ap.parse_args(list(argv) if argv is not None else None)
         shuffle=args.shuffle,
         seed=args.seed,
         workers=args.workers,
+        min_why=args.min_why,
     )
     print_summary(results)
         "shuffle": args.shuffle,
         "seed": args.seed,
         "workers": args.workers,
+        "min_why": args.min_why,
         "n_errors": sum(1 for r in results if r.error),
     }