Spaces:
Running
Running
Food Desert committed on
Commit ·
5c070ef
1
Parent(s): 3127166
Update pipeline diagram and classifier context rescoring
Browse files
- app.py +40 -28
- docs/retrieval_contract.md +2 -2
- docs/rewrite_contract.md +1 -1
- docs/space_overview.md +10 -10
- scripts/eval_pipeline.py +0 -1
app.py
CHANGED
|
@@ -385,7 +385,7 @@ def _build_arch_diagram_html() -> str:
|
|
| 385 |
return """
|
| 386 |
<div class="arch-diagram-wrap">
|
| 387 |
<h2>Architecture At A Glance</h2>
|
| 388 |
-
<svg class="arch-flow" viewBox="0 0 1200
|
| 389 |
<defs>
|
| 390 |
<marker id="arch-arrowhead" markerWidth="10" markerHeight="8" refX="9" refY="4" orient="auto" markerUnits="strokeWidth">
|
| 391 |
<path d="M0,0 L10,4 L0,8 Z" fill="#334155" />
|
|
@@ -436,37 +436,43 @@ def _build_arch_diagram_html() -> str:
|
|
| 436 |
<path d="M600 174 C652 210, 700 224, 735 280" class="arch-soft-line" />
|
| 437 |
<path d="M600 174 C760 202, 916 210, 995 280" class="arch-soft-line" />
|
| 438 |
|
| 439 |
-
<rect x="
|
| 440 |
-
<text x="
|
| 441 |
-
<text x="
|
| 442 |
-
|
| 443 |
-
<rect x="
|
| 444 |
-
<text x="
|
| 445 |
-
<text x="
|
| 446 |
-
|
| 447 |
-
<
|
| 448 |
-
<
|
| 449 |
-
<
|
| 450 |
-
|
| 451 |
-
<path d="
|
| 452 |
-
<path d="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 453 |
<text x="990" y="455" text-anchor="middle" class="arch-small">top candidates</text>
|
| 454 |
|
| 455 |
-
<rect x="
|
| 456 |
-
<text x="
|
| 457 |
-
<text x="
|
| 458 |
|
| 459 |
-
<rect x="
|
| 460 |
-
<text x="
|
| 461 |
-
<text x="
|
| 462 |
|
| 463 |
-
<path d="
|
| 464 |
-
<path d="M735 352
|
| 465 |
-
<path d="M995 352
|
| 466 |
-
<path d="
|
| 467 |
|
| 468 |
-
<text x="
|
| 469 |
-
<text x="
|
| 470 |
</svg>
|
| 471 |
</div>
|
| 472 |
"""
|
|
@@ -3122,7 +3128,13 @@ def rag_pipeline_ui(
|
|
| 3122 |
log("Step 2: Prompt Squirrel retrieval (hidden)")
|
| 3123 |
try:
|
| 3124 |
t0 = time.perf_counter()
|
| 3125 |
-
retrieval_context_tags = list(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3126 |
rewrite_phrases = [p.strip() for p in (rewrite_for_retrieval or "").split(",") if p.strip()]
|
| 3127 |
retrieval_result = psq_candidates_from_rewrite_phrases(
|
| 3128 |
rewrite_phrases=rewrite_phrases,
|
|
|
|
| 385 |
return """
|
| 386 |
<div class="arch-diagram-wrap">
|
| 387 |
<h2>Architecture At A Glance</h2>
|
| 388 |
+
<svg class="arch-flow" viewBox="0 0 1200 940" role="img" aria-label="Prompt Squirrel pipeline diagram showing query reformulation, lexical matching, scene composition, tag classification, semantic retrieval, context rescoring, candidate ranking, and editable output">
|
| 389 |
<defs>
|
| 390 |
<marker id="arch-arrowhead" markerWidth="10" markerHeight="8" refX="9" refY="4" orient="auto" markerUnits="strokeWidth">
|
| 391 |
<path d="M0,0 L10,4 L0,8 Z" fill="#334155" />
|
|
|
|
| 436 |
<path d="M600 174 C652 210, 700 224, 735 280" class="arch-soft-line" />
|
| 437 |
<path d="M600 174 C760 202, 916 210, 995 280" class="arch-soft-line" />
|
| 438 |
|
| 439 |
+
<rect x="80" y="480" width="270" height="76" class="arch-box arch-retrieval" />
|
| 440 |
+
<text x="215" y="510" text-anchor="middle" class="arch-title">Semantic Retrieval</text>
|
| 441 |
+
<text x="215" y="533" text-anchor="middle" class="arch-small">FastText/HNSW candidates</text>
|
| 442 |
+
|
| 443 |
+
<rect x="465" y="480" width="270" height="76" class="arch-box arch-retrieval" />
|
| 444 |
+
<text x="600" y="510" text-anchor="middle" class="arch-title">Context Rescoring</text>
|
| 445 |
+
<text x="600" y="533" text-anchor="middle" class="arch-small">TF-IDF/SVD context</text>
|
| 446 |
+
|
| 447 |
+
<rect x="850" y="480" width="270" height="76" class="arch-box arch-llm" />
|
| 448 |
+
<text x="985" y="510" text-anchor="middle" class="arch-title">Candidate Ranking</text>
|
| 449 |
+
<text x="985" y="533" text-anchor="middle" class="arch-small">Mistral 24B indices</text>
|
| 450 |
+
|
| 451 |
+
<path d="M205 352 C190 398, 188 438, 215 480" class="arch-line" />
|
| 452 |
+
<path d="M465 352 C390 402, 330 435, 275 480" class="arch-line" />
|
| 453 |
+
<path d="M350 518 L465 518" class="arch-line" />
|
| 454 |
+
<path d="M735 518 L850 518" class="arch-line" />
|
| 455 |
+
<path d="M735 352 C710 404, 660 440, 630 480" class="arch-soft-line" />
|
| 456 |
+
<path d="M995 352 C950 404, 760 438, 670 480" class="arch-soft-line" />
|
| 457 |
+
<path d="M735 352 C785 405, 840 440, 900 480" class="arch-soft-line" />
|
| 458 |
+
<path d="M995 352 C1010 402, 970 448, 940 480" class="arch-soft-line" />
|
| 459 |
<text x="990" y="455" text-anchor="middle" class="arch-small">top candidates</text>
|
| 460 |
|
| 461 |
+
<rect x="455" y="650" width="290" height="76" class="arch-box arch-deterministic" />
|
| 462 |
+
<text x="600" y="680" text-anchor="middle" class="arch-title">Final Tag Merge</text>
|
| 463 |
+
<text x="600" y="703" text-anchor="middle" class="arch-small">scene, classifier auto, implications</text>
|
| 464 |
|
| 465 |
+
<rect x="450" y="800" width="300" height="76" class="arch-box arch-output" />
|
| 466 |
+
<text x="600" y="830" text-anchor="middle" class="arch-title">Editable output</text>
|
| 467 |
+
<text x="600" y="853" text-anchor="middle" class="arch-small">ranked rows and suggested prompt</text>
|
| 468 |
|
| 469 |
+
<path d="M985 556 C900 618, 760 636, 630 650" class="arch-line" />
|
| 470 |
+
<path d="M735 352 C700 490, 620 570, 600 650" class="arch-soft-line" />
|
| 471 |
+
<path d="M995 352 C1045 505, 795 610, 630 650" class="arch-soft-line" />
|
| 472 |
+
<path d="M600 726 L600 800" class="arch-line" />
|
| 473 |
|
| 474 |
+
<text x="820" y="392" text-anchor="middle" class="arch-small">context and auto-tags</text>
|
| 475 |
+
<text x="760" y="625" text-anchor="middle" class="arch-small">selected tags</text>
|
| 476 |
</svg>
|
| 477 |
</div>
|
| 478 |
"""
|
|
|
|
| 3128 |
log("Step 2: Prompt Squirrel retrieval (hidden)")
|
| 3129 |
try:
|
| 3130 |
t0 = time.perf_counter()
|
| 3131 |
+
retrieval_context_tags = list(
|
| 3132 |
+
dict.fromkeys(
|
| 3133 |
+
(structural_tags or [])
|
| 3134 |
+
+ (probe_tags or [])
|
| 3135 |
+
+ (classifier_auto_tags or [])
|
| 3136 |
+
)
|
| 3137 |
+
)
|
| 3138 |
rewrite_phrases = [p.strip() for p in (rewrite_for_retrieval or "").split(",") if p.strip()]
|
| 3139 |
retrieval_result = psq_candidates_from_rewrite_phrases(
|
| 3140 |
rewrite_phrases=rewrite_phrases,
|
docs/retrieval_contract.md
CHANGED
|
@@ -27,8 +27,8 @@ Notes:
|
|
| 27 |
- `PSQ_RETRIEVAL_PER_PHRASE_K` (default 10)
|
| 28 |
- `PSQ_RETRIEVAL_PER_PHRASE_FINAL_K` (default 1)
|
| 29 |
- `PSQ_MIN_TAG_COUNT` (default 100 in app path)
|
| 30 |
-
- Stage 2 may be called with
|
| 31 |
-
- In app orchestration, classifier candidate tags can be injected into the candidate pool after Stage 2 retrieval and before Stage 3 selection.
|
| 32 |
|
| 33 |
---
|
| 34 |
|
|
|
|
| 27 |
- `PSQ_RETRIEVAL_PER_PHRASE_K` (default 10)
|
| 28 |
- `PSQ_RETRIEVAL_PER_PHRASE_FINAL_K` (default 1)
|
| 29 |
- `PSQ_MIN_TAG_COUNT` (default 100 in app path)
|
| 30 |
+
- Stage 2 may be called with Scene Composition tags and high-confidence Tag Classifier tags in `context_tags` to improve TF-IDF/SVD context scoring.
|
| 31 |
+
- In app orchestration, lower-confidence classifier candidate tags can be injected into the candidate pool after Stage 2 retrieval and before Stage 3 selection.
|
| 32 |
|
| 33 |
---
|
| 34 |
|
docs/rewrite_contract.md
CHANGED
|
@@ -51,7 +51,7 @@ The OpenRouter rewrite helper uses one deterministic call:
|
|
| 51 |
Model/auth endpoint behavior comes from `openrouter_client.py`:
|
| 52 |
- OpenRouter endpoint: `/chat/completions`
|
| 53 |
- API key from `OPENROUTER_API_KEY`
|
| 54 |
-
- model from `OPENROUTER_MODEL` (default `
|
| 55 |
|
| 56 |
---
|
| 57 |
|
|
|
|
| 51 |
Model/auth endpoint behavior comes from `openrouter_client.py`:
|
| 52 |
- OpenRouter endpoint: `/chat/completions`
|
| 53 |
- API key from `OPENROUTER_API_KEY`
|
| 54 |
+
- model from `OPENROUTER_MODEL` (default `mistralai/mistral-small-24b-instruct-2501`).
|
| 55 |
|
| 56 |
---
|
| 57 |
|
docs/space_overview.md
CHANGED
|
@@ -24,17 +24,21 @@ Design goals:
|
|
| 24 |
- `Tag Classifier`:
|
| 25 |
Runs a local multi-label classifier over the user prompt. Tags above calibrated high-precision thresholds are added automatically, and the next highest-scoring tags are added to the reranker candidate pool.
|
| 26 |
- `Semantic Retrieval`:
|
| 27 |
-
Uses reformulated phrases and lexical matches to pull a high-recall candidate pool from the fixed vocabulary using FastText/HNSW
|
|
|
|
|
|
|
| 28 |
- `Candidate Ranking`:
|
| 29 |
Runs an LLM call that can only choose from the retrieved candidate list, including classifier-suggested candidates. It cannot invent new tags.
|
|
|
|
|
|
|
| 30 |
|
| 31 |
-
After
|
| 32 |
|
| 33 |
## Design Rationale
|
| 34 |
|
| 35 |
- Query Reformulation and Semantic Retrieval are separate so search phrase generation stays flexible while candidate generation stays deterministic.
|
| 36 |
- Lexical Matching protects obvious prompt words that are already canonical tags or aliases, especially for rarer tags where the rewrite model may miss the term.
|
| 37 |
-
- Semantic Retrieval uses fast nearest-neighbor search
|
| 38 |
- Candidate Ranking is constrained to a high-quality candidate set so the ranking LLM cannot invent tags.
|
| 39 |
- Scene Composition and Tag Classifier run in parallel with Query Reformulation so they can add context and high-confidence tags without adding much latency.
|
| 40 |
- Users control the final prompt by toggling suggested tags on/off; the prompt text is generated from those toggle states.
|
|
@@ -86,7 +90,7 @@ This keeps the extracted wiki file immutable while allowing targeted manual fixe
|
|
| 86 |
- Local `t5-small` encoder-decoder (text-to-text seq2seq) model for query reformulation.
|
| 87 |
- Local ModernBERT multi-label classifier for calibrated high-confidence tags and extra reranker candidates.
|
| 88 |
- OpenRouter-served instruction LLMs for structural inference and closed-set selection.
|
| 89 |
-
Default model: `
|
| 90 |
- Gradio for the interactive web UI (tag toggles, ranked rows, and suggested prompt text).
|
| 91 |
- Python pipeline orchestration with CSV/JSON data sources and implication-graph expansion.
|
| 92 |
|
|
@@ -94,11 +98,7 @@ This keeps the extracted wiki file immutable while allowing targeted manual fixe
|
|
| 94 |
|
| 95 |
Current evaluation style compares selected tags against ground-truth tags on caption-evident samples.
|
| 96 |
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
- Precision: `TP / (TP + FP)`
|
| 100 |
-
- Recall: `TP / (TP + FN)`
|
| 101 |
-
- F1: harmonic mean of precision/recall
|
| 102 |
|
| 103 |
The evaluation focus is practical:
|
| 104 |
|
|
@@ -121,7 +121,7 @@ Summary from `data/analysis/rewrite_ablation_n30_e2e_ndcg_20260509.json`:
|
|
| 121 |
|
| 122 |
## System Snapshot
|
| 123 |
|
| 124 |
-
Prompt Squirrel maps unstructured text into a closed, editable tag vocabulary under practical latency constraints. The online path runs Query Reformulation, Lexical Matching, Scene Composition, and Tag Classifier inference before Semantic Retrieval builds the candidate pool. Candidate Ranking selects from that pool, after which high-confidence Scene Composition tags, calibrated Tag Classifier tags, and implication rules
|
| 125 |
|
| 126 |
## Evaluation Dataset Snapshot
|
| 127 |
|
|
|
|
| 24 |
- `Tag Classifier`:
|
| 25 |
Runs a local multi-label classifier over the user prompt. Tags above calibrated high-precision thresholds are added automatically, and the next highest-scoring tags are added to the reranker candidate pool.
|
| 26 |
- `Semantic Retrieval`:
|
| 27 |
+
Uses reformulated phrases and lexical matches to pull a high-recall candidate pool from the fixed vocabulary using FastText/HNSW.
|
| 28 |
+
- `Context Rescoring`:
|
| 29 |
+
Reweights and deduplicates retrieved candidates using TF-IDF/SVD context from the query, Scene Composition tags, and high-confidence Tag Classifier tags before final ranking.
|
| 30 |
- `Candidate Ranking`:
|
| 31 |
Runs an LLM call that can only choose from the retrieved candidate list, including classifier-suggested candidates. It cannot invent new tags.
|
| 32 |
+
- `Final Tag Merge`:
|
| 33 |
+
Merges Candidate Ranking selections with high-confidence Scene Composition tags, high-confidence Tag Classifier tags, and implication-rule additions before building the editable output.
|
| 34 |
|
| 35 |
+
After Final Tag Merge, Prompt Squirrel presents editable rows plus the final suggested prompt.
|
| 36 |
|
| 37 |
## Design Rationale
|
| 38 |
|
| 39 |
- Query Reformulation and Semantic Retrieval are separate so search phrase generation stays flexible while candidate generation stays deterministic.
|
| 40 |
- Lexical Matching protects obvious prompt words that are already canonical tags or aliases, especially for rarer tags where the rewrite model may miss the term.
|
| 41 |
+
- Semantic Retrieval uses fast nearest-neighbor search to maximize recall, then Context Rescoring uses lightweight context signals before Candidate Ranking sharpens the final selection.
|
| 42 |
- Candidate Ranking is constrained to a high-quality candidate set so the ranking LLM cannot invent tags.
|
| 43 |
- Scene Composition and Tag Classifier run in parallel with Query Reformulation so they can add context and high-confidence tags without adding much latency.
|
| 44 |
- Users control the final prompt by toggling suggested tags on/off; the prompt text is generated from those toggle states.
|
|
|
|
| 90 |
- Local `t5-small` encoder-decoder (text-to-text seq2seq) model for query reformulation.
|
| 91 |
- Local ModernBERT multi-label classifier for calibrated high-confidence tags and extra reranker candidates.
|
| 92 |
- OpenRouter-served instruction LLMs for structural inference and closed-set selection.
|
| 93 |
+
Default model: `mistralai/mistral-small-24b-instruct-2501` (configurable).
|
| 94 |
- Gradio for the interactive web UI (tag toggles, ranked rows, and suggested prompt text).
|
| 95 |
- Python pipeline orchestration with CSV/JSON data sources and implication-graph expansion.
|
| 96 |
|
|
|
|
| 98 |
|
| 99 |
Current evaluation style compares selected tags against ground-truth tags on caption-evident samples.
|
| 100 |
|
| 101 |
+
F1 is the primary end-to-end metric.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
|
| 103 |
The evaluation focus is practical:
|
| 104 |
|
|
|
|
| 121 |
|
| 122 |
## System Snapshot
|
| 123 |
|
| 124 |
+
Prompt Squirrel maps unstructured text into a closed, editable tag vocabulary under practical latency constraints. The online path runs Query Reformulation, Lexical Matching, Scene Composition, and Tag Classifier inference before Semantic Retrieval builds the candidate pool and Context Rescoring reweights it. Candidate Ranking selects from that pool, after which Final Tag Merge combines high-confidence Scene Composition tags, calibrated Tag Classifier tags, and implication rules into the editable output. Current N=30 results show materially higher retrieval quality and end-to-end F1 after moving Query Reformulation to the local T5 model and adding Lexical Matching plus classifier candidates.
|
| 125 |
|
| 126 |
## Evaluation Dataset Snapshot
|
| 127 |
|
scripts/eval_pipeline.py
CHANGED
|
@@ -408,7 +408,6 @@ def _process_one_sample(
|
|
| 408 |
result.structural_tags
|
| 409 |
+ result.probe_tags
|
| 410 |
+ result.classifier_auto_tags
|
| 411 |
-
+ result.classifier_candidate_tags
|
| 412 |
)
|
| 413 |
)
|
| 414 |
retrieval_result = psq_candidates_from_rewrite_phrases(
|
|
|
|
| 408 |
result.structural_tags
|
| 409 |
+ result.probe_tags
|
| 410 |
+ result.classifier_auto_tags
|
|
|
|
| 411 |
)
|
| 412 |
)
|
| 413 |
retrieval_result = psq_candidates_from_rewrite_phrases(
|