Food Desert committed on
Commit
5c070ef
·
1 Parent(s): 3127166

Update pipeline diagram and classifier context rescoring

Browse files
app.py CHANGED
@@ -385,7 +385,7 @@ def _build_arch_diagram_html() -> str:
385
  return """
386
  <div class="arch-diagram-wrap">
387
  <h2>Architecture At A Glance</h2>
388
- <svg class="arch-flow" viewBox="0 0 1200 820" role="img" aria-label="Prompt Squirrel pipeline diagram showing query reformulation, lexical matching, scene composition, tag classification, semantic retrieval, candidate ranking, and editable output">
389
  <defs>
390
  <marker id="arch-arrowhead" markerWidth="10" markerHeight="8" refX="9" refY="4" orient="auto" markerUnits="strokeWidth">
391
  <path d="M0,0 L10,4 L0,8 Z" fill="#334155" />
@@ -436,37 +436,43 @@ def _build_arch_diagram_html() -> str:
436
  <path d="M600 174 C652 210, 700 224, 735 280" class="arch-soft-line" />
437
  <path d="M600 174 C760 202, 916 210, 995 280" class="arch-soft-line" />
438
 
439
- <rect x="140" y="480" width="360" height="76" class="arch-box arch-retrieval" />
440
- <text x="320" y="510" text-anchor="middle" class="arch-title">Semantic Retrieval</text>
441
- <text x="320" y="533" text-anchor="middle" class="arch-small">FastText/HNSW plus context scoring</text>
442
-
443
- <rect x="700" y="480" width="340" height="76" class="arch-box arch-llm" />
444
- <text x="870" y="510" text-anchor="middle" class="arch-title">Candidate Ranking</text>
445
- <text x="870" y="533" text-anchor="middle" class="arch-small">Mistral 24B candidate indices</text>
446
-
447
- <path d="M205 352 C200 394, 240 438, 290 480" class="arch-line" />
448
- <path d="M465 352 C418 402, 370 428, 340 480" class="arch-line" />
449
- <path d="M500 518 L700 518" class="arch-line" />
450
- <path d="M735 352 C650 398, 498 438, 395 480" class="arch-soft-line" />
451
- <path d="M735 352 C774 408, 820 448, 855 480" class="arch-soft-line" />
452
- <path d="M995 352 C1010 402, 965 448, 910 480" class="arch-soft-line" />
 
 
 
 
 
 
453
  <text x="990" y="455" text-anchor="middle" class="arch-small">top candidates</text>
454
 
455
- <rect x="320" y="640" width="290" height="76" class="arch-box arch-deterministic" />
456
- <text x="465" y="670" text-anchor="middle" class="arch-title">Deterministic additions</text>
457
- <text x="465" y="693" text-anchor="middle" class="arch-small">scene, classifier auto, implications</text>
458
 
459
- <rect x="720" y="640" width="300" height="76" class="arch-box arch-output" />
460
- <text x="870" y="670" text-anchor="middle" class="arch-title">Editable output</text>
461
- <text x="870" y="693" text-anchor="middle" class="arch-small">ranked rows and suggested prompt</text>
462
 
463
- <path d="M870 556 C820 604, 704 620, 590 650" class="arch-line" />
464
- <path d="M735 352 C660 470, 530 544, 477 640" class="arch-soft-line" />
465
- <path d="M995 352 C1038 486, 726 568, 508 640" class="arch-soft-line" />
466
- <path d="M610 678 L720 678" class="arch-line" />
467
 
468
- <text x="736" y="392" text-anchor="middle" class="arch-small">context and auto-tags</text>
469
- <text x="570" y="622" text-anchor="middle" class="arch-small">selected tags</text>
470
  </svg>
471
  </div>
472
  """
@@ -3122,7 +3128,13 @@ def rag_pipeline_ui(
3122
  log("Step 2: Prompt Squirrel retrieval (hidden)")
3123
  try:
3124
  t0 = time.perf_counter()
3125
- retrieval_context_tags = list(dict.fromkeys((structural_tags or []) + (probe_tags or [])))
 
 
 
 
 
 
3126
  rewrite_phrases = [p.strip() for p in (rewrite_for_retrieval or "").split(",") if p.strip()]
3127
  retrieval_result = psq_candidates_from_rewrite_phrases(
3128
  rewrite_phrases=rewrite_phrases,
 
385
  return """
386
  <div class="arch-diagram-wrap">
387
  <h2>Architecture At A Glance</h2>
388
+ <svg class="arch-flow" viewBox="0 0 1200 940" role="img" aria-label="Prompt Squirrel pipeline diagram showing query reformulation, lexical matching, scene composition, tag classification, semantic retrieval, context rescoring, candidate ranking, and editable output">
389
  <defs>
390
  <marker id="arch-arrowhead" markerWidth="10" markerHeight="8" refX="9" refY="4" orient="auto" markerUnits="strokeWidth">
391
  <path d="M0,0 L10,4 L0,8 Z" fill="#334155" />
 
436
  <path d="M600 174 C652 210, 700 224, 735 280" class="arch-soft-line" />
437
  <path d="M600 174 C760 202, 916 210, 995 280" class="arch-soft-line" />
438
 
439
+ <rect x="80" y="480" width="270" height="76" class="arch-box arch-retrieval" />
440
+ <text x="215" y="510" text-anchor="middle" class="arch-title">Semantic Retrieval</text>
441
+ <text x="215" y="533" text-anchor="middle" class="arch-small">FastText/HNSW candidates</text>
442
+
443
+ <rect x="465" y="480" width="270" height="76" class="arch-box arch-retrieval" />
444
+ <text x="600" y="510" text-anchor="middle" class="arch-title">Context Rescoring</text>
445
+ <text x="600" y="533" text-anchor="middle" class="arch-small">TF-IDF/SVD context</text>
446
+
447
+ <rect x="850" y="480" width="270" height="76" class="arch-box arch-llm" />
448
+ <text x="985" y="510" text-anchor="middle" class="arch-title">Candidate Ranking</text>
449
+ <text x="985" y="533" text-anchor="middle" class="arch-small">Mistral 24B indices</text>
450
+
451
+ <path d="M205 352 C190 398, 188 438, 215 480" class="arch-line" />
452
+ <path d="M465 352 C390 402, 330 435, 275 480" class="arch-line" />
453
+ <path d="M350 518 L465 518" class="arch-line" />
454
+ <path d="M735 518 L850 518" class="arch-line" />
455
+ <path d="M735 352 C710 404, 660 440, 630 480" class="arch-soft-line" />
456
+ <path d="M995 352 C950 404, 760 438, 670 480" class="arch-soft-line" />
457
+ <path d="M735 352 C785 405, 840 440, 900 480" class="arch-soft-line" />
458
+ <path d="M995 352 C1010 402, 970 448, 940 480" class="arch-soft-line" />
459
  <text x="990" y="455" text-anchor="middle" class="arch-small">top candidates</text>
460
 
461
+ <rect x="455" y="650" width="290" height="76" class="arch-box arch-deterministic" />
462
+ <text x="600" y="680" text-anchor="middle" class="arch-title">Final Tag Merge</text>
463
+ <text x="600" y="703" text-anchor="middle" class="arch-small">scene, classifier auto, implications</text>
464
 
465
+ <rect x="450" y="800" width="300" height="76" class="arch-box arch-output" />
466
+ <text x="600" y="830" text-anchor="middle" class="arch-title">Editable output</text>
467
+ <text x="600" y="853" text-anchor="middle" class="arch-small">ranked rows and suggested prompt</text>
468
 
469
+ <path d="M985 556 C900 618, 760 636, 630 650" class="arch-line" />
470
+ <path d="M735 352 C700 490, 620 570, 600 650" class="arch-soft-line" />
471
+ <path d="M995 352 C1045 505, 795 610, 630 650" class="arch-soft-line" />
472
+ <path d="M600 726 L600 800" class="arch-line" />
473
 
474
+ <text x="820" y="392" text-anchor="middle" class="arch-small">context and auto-tags</text>
475
+ <text x="760" y="625" text-anchor="middle" class="arch-small">selected tags</text>
476
  </svg>
477
  </div>
478
  """
 
3128
  log("Step 2: Prompt Squirrel retrieval (hidden)")
3129
  try:
3130
  t0 = time.perf_counter()
3131
+ retrieval_context_tags = list(
3132
+ dict.fromkeys(
3133
+ (structural_tags or [])
3134
+ + (probe_tags or [])
3135
+ + (classifier_auto_tags or [])
3136
+ )
3137
+ )
3138
  rewrite_phrases = [p.strip() for p in (rewrite_for_retrieval or "").split(",") if p.strip()]
3139
  retrieval_result = psq_candidates_from_rewrite_phrases(
3140
  rewrite_phrases=rewrite_phrases,
docs/retrieval_contract.md CHANGED
@@ -27,8 +27,8 @@ Notes:
27
  - `PSQ_RETRIEVAL_PER_PHRASE_K` (default 10)
28
  - `PSQ_RETRIEVAL_PER_PHRASE_FINAL_K` (default 1)
29
  - `PSQ_MIN_TAG_COUNT` (default 100 in app path)
30
- - Stage 2 may be called with structural tags in `context_tags` to improve context scoring.
31
- - In app orchestration, classifier candidate tags can be injected into the candidate pool after Stage 2 retrieval and before Stage 3 selection.
32
 
33
  ---
34
 
 
27
  - `PSQ_RETRIEVAL_PER_PHRASE_K` (default 10)
28
  - `PSQ_RETRIEVAL_PER_PHRASE_FINAL_K` (default 1)
29
  - `PSQ_MIN_TAG_COUNT` (default 100 in app path)
30
+ - Stage 2 may be called with Scene Composition tags and high-confidence Tag Classifier tags in `context_tags` to improve TF-IDF/SVD context scoring.
31
+ - In app orchestration, lower-confidence classifier candidate tags can be injected into the candidate pool after Stage 2 retrieval and before Stage 3 selection.
32
 
33
  ---
34
 
docs/rewrite_contract.md CHANGED
@@ -51,7 +51,7 @@ The OpenRouter rewrite helper uses one deterministic call:
51
  Model/auth endpoint behavior comes from `openrouter_client.py`:
52
  - OpenRouter endpoint: `/chat/completions`
53
  - API key from `OPENROUTER_API_KEY`
54
- - model from `OPENROUTER_MODEL` (default `meta-llama/llama-3.1-8b-instruct`).
55
 
56
  ---
57
 
 
51
  Model/auth endpoint behavior comes from `openrouter_client.py`:
52
  - OpenRouter endpoint: `/chat/completions`
53
  - API key from `OPENROUTER_API_KEY`
54
+ - model from `OPENROUTER_MODEL` (default `mistralai/mistral-small-24b-instruct-2501`).
55
 
56
  ---
57
 
docs/space_overview.md CHANGED
@@ -24,17 +24,21 @@ Design goals:
24
  - `Tag Classifier`:
25
  Runs a local multi-label classifier over the user prompt. Tags above calibrated high-precision thresholds are added automatically, and the next highest-scoring tags are added to the reranker candidate pool.
26
  - `Semantic Retrieval`:
27
- Uses reformulated phrases and lexical matches to pull a high-recall candidate pool from the fixed vocabulary using FastText/HNSW, then applies lightweight context scoring and deduplication before ranking.
 
 
28
  - `Candidate Ranking`:
29
  Runs an LLM call that can only choose from the retrieved candidate list, including classifier-suggested candidates. It cannot invent new tags.
 
 
30
 
31
- After Candidate Ranking, Prompt Squirrel applies deterministic implication expansion, merges in high-confidence Scene Composition and Tag Classifier tags, and presents editable rows plus the final suggested prompt.
32
 
33
  ## Design Rationale
34
 
35
  - Query Reformulation and Semantic Retrieval are separate so search phrase generation stays flexible while candidate generation stays deterministic.
36
  - Lexical Matching protects obvious prompt words that are already canonical tags or aliases, especially for rarer tags where the rewrite model may miss the term.
37
- - Semantic Retrieval uses fast nearest-neighbor search plus lightweight context scoring to maximize recall before Candidate Ranking sharpens the final selection.
38
  - Candidate Ranking is constrained to a high-quality candidate set so the ranking LLM cannot invent tags.
39
  - Scene Composition and Tag Classifier run in parallel with Query Reformulation so they can add context and high-confidence tags without adding much latency.
40
  - Users control the final prompt by toggling suggested tags on/off; the prompt text is generated from those toggle states.
@@ -86,7 +90,7 @@ This keeps the extracted wiki file immutable while allowing targeted manual fixe
86
  - Local `t5-small` encoder-decoder (text-to-text seq2seq) model for query reformulation.
87
  - Local ModernBERT multi-label classifier for calibrated high-confidence tags and extra reranker candidates.
88
  - OpenRouter-served instruction LLMs for structural inference and closed-set selection.
89
- Default model: `meta-llama/llama-3.1-8b-instruct` (configurable).
90
  - Gradio for the interactive web UI (tag toggles, ranked rows, and suggested prompt text).
91
  - Python pipeline orchestration with CSV/JSON data sources and implication-graph expansion.
92
 
@@ -94,11 +98,7 @@ This keeps the extracted wiki file immutable while allowing targeted manual fixe
94
 
95
  Current evaluation style compares selected tags against ground-truth tags on caption-evident samples.
96
 
97
- Primary metrics:
98
-
99
- - Precision: `TP / (TP + FP)`
100
- - Recall: `TP / (TP + FN)`
101
- - F1: harmonic mean of precision/recall
102
 
103
  The evaluation focus is practical:
104
 
@@ -121,7 +121,7 @@ Summary from `data/analysis/rewrite_ablation_n30_e2e_ndcg_20260509.json`:
121
 
122
  ## System Snapshot
123
 
124
- Prompt Squirrel maps unstructured text into a closed, editable tag vocabulary under practical latency constraints. The online path runs Query Reformulation, Lexical Matching, Scene Composition, and Tag Classifier inference before Semantic Retrieval builds the candidate pool. Candidate Ranking selects from that pool, after which high-confidence Scene Composition tags, calibrated Tag Classifier tags, and implication rules are merged into the editable output. Current N=30 results show materially higher retrieval quality and end-to-end F1 after moving Query Reformulation to the local T5 model and adding Lexical Matching plus classifier candidates.
125
 
126
  ## Evaluation Dataset Snapshot
127
 
 
24
  - `Tag Classifier`:
25
  Runs a local multi-label classifier over the user prompt. Tags above calibrated high-precision thresholds are added automatically, and the next highest-scoring tags are added to the reranker candidate pool.
26
  - `Semantic Retrieval`:
27
+ Uses reformulated phrases and lexical matches to pull a high-recall candidate pool from the fixed vocabulary using FastText/HNSW.
28
+ - `Context Rescoring`:
29
+ Reweights and deduplicates retrieved candidates using TF-IDF/SVD context from the query, Scene Composition tags, and high-confidence Tag Classifier tags before final ranking.
30
  - `Candidate Ranking`:
31
  Runs an LLM call that can only choose from the retrieved candidate list, including classifier-suggested candidates. It cannot invent new tags.
32
+ - `Final Tag Merge`:
33
+ Merges Candidate Ranking selections with high-confidence Scene Composition tags, high-confidence Tag Classifier tags, and implication-rule additions before building the editable output.
34
 
35
+ After Final Tag Merge, Prompt Squirrel presents editable rows plus the final suggested prompt.
36
 
37
  ## Design Rationale
38
 
39
  - Query Reformulation and Semantic Retrieval are separate so search phrase generation stays flexible while candidate generation stays deterministic.
40
  - Lexical Matching protects obvious prompt words that are already canonical tags or aliases, especially for rarer tags where the rewrite model may miss the term.
41
+ - Semantic Retrieval uses fast nearest-neighbor search to maximize recall, then Context Rescoring uses lightweight context signals before Candidate Ranking sharpens the final selection.
42
  - Candidate Ranking is constrained to a high-quality candidate set so the ranking LLM cannot invent tags.
43
  - Scene Composition and Tag Classifier run in parallel with Query Reformulation so they can add context and high-confidence tags without adding much latency.
44
  - Users control the final prompt by toggling suggested tags on/off; the prompt text is generated from those toggle states.
 
90
  - Local `t5-small` encoder-decoder (text-to-text seq2seq) model for query reformulation.
91
  - Local ModernBERT multi-label classifier for calibrated high-confidence tags and extra reranker candidates.
92
  - OpenRouter-served instruction LLMs for structural inference and closed-set selection.
93
+ Default model: `mistralai/mistral-small-24b-instruct-2501` (configurable).
94
  - Gradio for the interactive web UI (tag toggles, ranked rows, and suggested prompt text).
95
  - Python pipeline orchestration with CSV/JSON data sources and implication-graph expansion.
96
 
 
98
 
99
  Current evaluation style compares selected tags against ground-truth tags on caption-evident samples.
100
 
101
+ F1 is the primary end-to-end metric.
 
 
 
 
102
 
103
  The evaluation focus is practical:
104
 
 
121
 
122
  ## System Snapshot
123
 
124
+ Prompt Squirrel maps unstructured text into a closed, editable tag vocabulary under practical latency constraints. The online path runs Query Reformulation, Lexical Matching, Scene Composition, and Tag Classifier inference before Semantic Retrieval builds the candidate pool and Context Rescoring reweights it. Candidate Ranking selects from that pool, after which Final Tag Merge combines high-confidence Scene Composition tags, calibrated Tag Classifier tags, and implication rules into the editable output. Current N=30 results show materially higher retrieval quality and end-to-end F1 after moving Query Reformulation to the local T5 model and adding Lexical Matching plus classifier candidates.
125
 
126
  ## Evaluation Dataset Snapshot
127
 
scripts/eval_pipeline.py CHANGED
@@ -408,7 +408,6 @@ def _process_one_sample(
408
  result.structural_tags
409
  + result.probe_tags
410
  + result.classifier_auto_tags
411
- + result.classifier_candidate_tags
412
  )
413
  )
414
  retrieval_result = psq_candidates_from_rewrite_phrases(
 
408
  result.structural_tags
409
  + result.probe_tags
410
  + result.classifier_auto_tags
 
411
  )
412
  )
413
  retrieval_result = psq_candidates_from_rewrite_phrases(