Spaces:

FoodDesert
/

Prompt_Squirrel_RAG

Running

App Files Files Community

Food Desert committed on May 11

Commit

29b12cd

1 Parent(s): c3191c6

Add exact n-gram retrieval query hints

Browse files

Files changed (5) hide show

.gitignore +8 -0
app.py +37 -8
docs/rewrite_contract.md +5 -0
psq_rag/pipeline/preproc.py +74 -10
scripts/test_exact_tag_query_phrases.py +104 -0

.gitignore CHANGED Viewed

@@ -29,6 +29,10 @@ data/analysis/openrouter_concurrency_*.json
 data/analysis/pipeline_call_count_probe*.json
 data/analysis/rewrite_only_compare_*.json
 data/analysis/rewrite_ablation_*.json
 data/analysis/t5_sweep_two_stage_*.json
 data/analysis/t5_sweep_two_stage_*.csv
 data/analysis/tmp_ckpt_compare_*.json
@@ -46,3 +50,7 @@ data/eval_results/tmp_llm_rewrite_diag*.jsonl
 data/eval_results/eval_caption_cogvlm_n30_llm_heur_*_20260509.jsonl
 data/eval_results/eval_caption_cogvlm_n30_t5_heur_*_20260509.jsonl
 data/eval_results/eval_caption_cogvlm_n1_seed42_20260509_005007.jsonl

 data/analysis/pipeline_call_count_probe*.json
 data/analysis/rewrite_only_compare_*.json
 data/analysis/rewrite_ablation_*.json
+data/analysis/retrieval_ngram_recovery_*.json
+data/analysis/retrieval_ngram_recovery_*.csv
+data/analysis/t5_tag_frequency_profile_*.json
+data/analysis/t5_tag_frequency_profile_*.csv
 data/analysis/t5_sweep_two_stage_*.json
 data/analysis/t5_sweep_two_stage_*.csv
 data/analysis/tmp_ckpt_compare_*.json
 data/eval_results/eval_caption_cogvlm_n30_llm_heur_*_20260509.jsonl
 data/eval_results/eval_caption_cogvlm_n30_t5_heur_*_20260509.jsonl
 data/eval_results/eval_caption_cogvlm_n1_seed42_20260509_005007.jsonl
+# Temporary local profiling helpers
+scripts/profile_retrieval_ngram_recovery.py
+scripts/profile_t5_tag_frequency.py

app.py CHANGED Viewed

@@ -78,7 +78,10 @@ if _STARTUP_PROFILE_ON and _STARTUP_PROFILE_PATH is not None:
 import gradio as gr
 _startup_profile_mark("import.gradio.done")
-from psq_rag.pipeline.preproc import extract_user_provided_tags_upto_3_words
 _startup_profile_mark("import.psq_rag.pipeline.preproc.done")
 from psq_rag.llm.rewrite import llm_rewrite_prompt
 _startup_profile_mark("import.psq_rag.llm.rewrite.done")
@@ -93,6 +96,7 @@ from psq_rag.retrieval.state import (
     get_tag_type_name,
     get_tag_implications,
     get_tag_counts,
 )
 _startup_profile_mark("import.psq_rag.retrieval.state.done")
 from psq_rag.ui.group_ranked_display import rank_groups_from_tfidf, _load_enabled_groups
@@ -1474,9 +1478,10 @@ display_top_groups_default = int(os.environ.get("PSQ_DISPLAY_TOP_GROUPS", "10"))
 display_top_tags_per_group_default = int(os.environ.get("PSQ_DISPLAY_TOP_TAGS_PER_GROUP", "7"))
 display_rank_top_k_default = int(os.environ.get("PSQ_DISPLAY_GROUP_RANK_TOP_K", "7"))
 display_max_rows_default = int(os.environ.get("PSQ_DISPLAY_MAX_ROWS", "14"))
-retrieval_global_k = int(os.environ.get("PSQ_RETRIEVAL_GLOBAL_K", "300"))
-retrieval_per_phrase_k = int(os.environ.get("PSQ_RETRIEVAL_PER_PHRASE_K", "10"))
 retrieval_per_phrase_final_k = int(os.environ.get("PSQ_RETRIEVAL_PER_PHRASE_FINAL_K", "1"))
 selection_mode = os.environ.get("PSQ_SELECTION_MODE", "chunked_map_union").strip()
 selection_chunk_size = int(os.environ.get("PSQ_SELECTION_CHUNK_SIZE", "60"))
 selection_per_phrase_k = int(os.environ.get("PSQ_SELECTION_PER_PHRASE_K", "2"))
@@ -2360,6 +2365,7 @@ def rag_pipeline_ui(
             f"retrieval_global_k={retrieval_global_k} "
             f"retrieval_per_phrase_k={retrieval_per_phrase_k} "
             f"retrieval_per_phrase_final_k={retrieval_per_phrase_final_k} "
             f"selection_mode={selection_mode} "
             f"selection_chunk_size={selection_chunk_size} "
             f"selection_per_phrase_k={selection_per_phrase_k} "
@@ -2386,6 +2392,14 @@ def rag_pipeline_ui(
         user_tags_raw = extract_user_provided_tags_upto_3_words(prompt_in)
         user_tags, removed_user_low = _filter_min_count_tags(user_tags_raw, min_tag_count)
         user_tags, removed_user_excluded = _filter_excluded_recommendation_tags(user_tags)
         dt = time.perf_counter()-t0
         _record_timing("preprocess", dt)
         log(f"Preprocess (user tag extraction): {dt:.2f}s")
@@ -2404,6 +2418,20 @@ def rag_pipeline_ui(
                 f"Filtered {len(removed_user_excluded)} excluded user tags: "
                 f"{', '.join(removed_user_excluded)}"
             )
         log("")
         rewrite_prefilled = (rewrite_override or "").strip()
@@ -2489,11 +2517,12 @@ def rag_pipeline_ui(
         log("Rewrite:")
         log(rewritten if rewritten else "(empty)")
         log("")
-        rewrite_for_retrieval = rewritten
-        if user_tags:
-            # keep them separate in logs, but allow them to help retrieval
-            rewrite_for_retrieval = (rewrite_for_retrieval + ", " + ", ".join(user_tags)).strip(", ").strip()
         log("Step 2: Prompt Squirrel retrieval (hidden)")

 import gradio as gr
 _startup_profile_mark("import.gradio.done")
+from psq_rag.pipeline.preproc import (
+    extract_exact_tag_query_phrases,
+    extract_user_provided_tags_upto_3_words,
+)
 _startup_profile_mark("import.psq_rag.pipeline.preproc.done")
 from psq_rag.llm.rewrite import llm_rewrite_prompt
 _startup_profile_mark("import.psq_rag.llm.rewrite.done")
     get_tag_type_name,
     get_tag_implications,
     get_tag_counts,
+    get_alias2tags,
 )
 _startup_profile_mark("import.psq_rag.retrieval.state.done")
 from psq_rag.ui.group_ranked_display import rank_groups_from_tfidf, _load_enabled_groups
 display_top_tags_per_group_default = int(os.environ.get("PSQ_DISPLAY_TOP_TAGS_PER_GROUP", "7"))
 display_rank_top_k_default = int(os.environ.get("PSQ_DISPLAY_GROUP_RANK_TOP_K", "7"))
 display_max_rows_default = int(os.environ.get("PSQ_DISPLAY_MAX_ROWS", "14"))
+retrieval_global_k = int(os.environ.get("PSQ_RETRIEVAL_GLOBAL_K", "300"))
+retrieval_per_phrase_k = int(os.environ.get("PSQ_RETRIEVAL_PER_PHRASE_K", "10"))
 retrieval_per_phrase_final_k = int(os.environ.get("PSQ_RETRIEVAL_PER_PHRASE_FINAL_K", "1"))
+retrieval_exact_ngram_max = int(os.environ.get("PSQ_RETRIEVAL_EXACT_NGRAM_MAX", "2"))
 selection_mode = os.environ.get("PSQ_SELECTION_MODE", "chunked_map_union").strip()
 selection_chunk_size = int(os.environ.get("PSQ_SELECTION_CHUNK_SIZE", "60"))
 selection_per_phrase_k = int(os.environ.get("PSQ_SELECTION_PER_PHRASE_K", "2"))
             f"retrieval_global_k={retrieval_global_k} "
             f"retrieval_per_phrase_k={retrieval_per_phrase_k} "
             f"retrieval_per_phrase_final_k={retrieval_per_phrase_final_k} "
+            f"retrieval_exact_ngram_max={retrieval_exact_ngram_max} "
             f"selection_mode={selection_mode} "
             f"selection_chunk_size={selection_chunk_size} "
             f"selection_per_phrase_k={selection_per_phrase_k} "
         user_tags_raw = extract_user_provided_tags_upto_3_words(prompt_in)
         user_tags, removed_user_low = _filter_min_count_tags(user_tags_raw, min_tag_count)
         user_tags, removed_user_excluded = _filter_excluded_recommendation_tags(user_tags)
+        exact_query_phrases = extract_exact_tag_query_phrases(
+            prompt_in,
+            get_tag_counts(),
+            get_alias2tags(),
+            min_tag_count=min_tag_count,
+            max_ngram=max(0, retrieval_exact_ngram_max),
+        )
+        exact_query_phrases, removed_exact_excluded = _filter_excluded_recommendation_tags(exact_query_phrases)
         dt = time.perf_counter()-t0
         _record_timing("preprocess", dt)
         log(f"Preprocess (user tag extraction): {dt:.2f}s")
                 f"Filtered {len(removed_user_excluded)} excluded user tags: "
                 f"{', '.join(removed_user_excluded)}"
             )
+        if retrieval_exact_ngram_max > 0:
+            log(f"Exact caption tag query phrases (1-{retrieval_exact_ngram_max} grams):")
+        else:
+            log("Exact caption tag query phrases: disabled")
+        if exact_query_phrases:
+            shown = ", ".join(exact_query_phrases[:40])
+            log(shown + (" ..." if len(exact_query_phrases) > 40 else ""))
+        else:
+            log("(none)")
+        if removed_exact_excluded:
+            log(
+                f"Filtered {len(removed_exact_excluded)} excluded exact query phrases: "
+                f"{', '.join(removed_exact_excluded)}"
+            )
         log("")
         rewrite_prefilled = (rewrite_override or "").strip()
         log("Rewrite:")
         log(rewritten if rewritten else "(empty)")
         log("")
+        rewrite_for_retrieval = rewritten
+        retrieval_query_hints = list(dict.fromkeys((user_tags or []) + (exact_query_phrases or [])))
+        if retrieval_query_hints:
+            # keep them separate in logs, but allow them to help retrieval
+            rewrite_for_retrieval = (rewrite_for_retrieval + ", " + ", ".join(retrieval_query_hints)).strip(", ").strip()
         log("Step 2: Prompt Squirrel retrieval (hidden)")

docs/rewrite_contract.md CHANGED Viewed

@@ -76,6 +76,11 @@ Outside Stage 1 itself, `app.py` also computes heuristic short phrases via:
 - split on `.` and `,`
 - keep segments with <= 3 tokens
 - case-insensitive dedupe
 These heuristic terms are later appended to retrieval input only if rewrite succeeds.

 - split on `.` and `,`
 - keep segments with <= 3 tokens
 - case-insensitive dedupe
+- `extract_exact_tag_query_phrases()`
+- scan prompt text for exact 1- to N-gram canonical tag or alias matches
+  - app default N is 2 (`PSQ_RETRIEVAL_EXACT_NGRAM_MAX`)
+  - matches must resolve to at least one canonical tag that clears `PSQ_MIN_TAG_COUNT`
+  - longest matches win: a matched n-gram suppresses any shorter match that overlaps its tokens
 These heuristic terms are later appended to retrieval input only if rewrite succeeds.

psq_rag/pipeline/preproc.py CHANGED Viewed

@@ -1,6 +1,11 @@
-import re
-def extract_user_provided_tags_upto_3_words(prompt_in: str) -> list[str]:
     """
     Heuristic:
     - split on '.' and ','
@@ -27,10 +32,69 @@ def extract_user_provided_tags_upto_3_words(prompt_in: str) -> list[str]:
             if key not in seen:
                 seen.add(key)
                 out.append(item)
-    return out
-if __name__ == "__main__":
-    print("preproc.py imports ok")

+import re
+from typing import Mapping, Sequence
+_TOKEN_RE = re.compile(r"[a-z0-9]+(?:'[a-z0-9]+)?")
+def extract_user_provided_tags_upto_3_words(prompt_in: str) -> list[str]:
     """
     Heuristic:
     - split on '.' and ','
             if key not in seen:
                 seen.add(key)
                 out.append(item)
+    return out
def extract_exact_tag_query_phrases(
    prompt_in: str,
    tag_counts: Mapping[str, int],
    alias2tags: Mapping[str, Sequence[str]],
    *,
    min_tag_count: int = 0,
    max_ngram: int = 2,
) -> list[str]:
    """Extract exact canonical/alias n-gram matches as retrieval query phrases.

    The prompt is lowercased and tokenized with ``_TOKEN_RE``; every run of
    1..``max_ngram`` consecutive tokens is joined with ``_`` and looked up.
    A lookup matches when it is a key of ``tag_counts`` that clears the count
    floor, or when ``alias2tags`` maps it to at least one tag that clears the
    count floor. Longest matches win: the tokens of a matched n-gram are
    consumed, so shorter matches overlapping them are never emitted. This
    holds even when the longer phrase is a repeat of one already emitted.

    Args:
        prompt_in: Raw prompt text. A leading ``caption_to_tags:`` task prefix
            (case-insensitive) is stripped before tokenizing.
        tag_counts: Canonical tag -> count.
        alias2tags: Alias -> canonical tags it resolves to.
        min_tag_count: Count floor; values <= 0 disable the floor.
        max_ngram: Largest n-gram size to try; values <= 0 disable extraction.

    Returns:
        Unique matched phrases (underscore-joined), ordered by the position of
        their first occurrence in the prompt.
    """
    if not prompt_in or max_ngram <= 0:
        return []

    text = prompt_in.strip()
    prefix = "caption_to_tags:"
    if text.lower().startswith(prefix):
        text = text[len(prefix):].strip()

    tokens = _TOKEN_RE.findall(text.lower())
    if not tokens:
        return []

    def _count_ok(tag: str) -> bool:
        # A non-positive floor means "accept everything".
        if min_tag_count <= 0:
            return True
        return int(tag_counts.get(tag, 0) or 0) >= min_tag_count

    def _resolves(lookup: str) -> bool:
        # Canonical tags are judged on their own count; aliases need at least
        # one target that clears the floor.
        if lookup in tag_counts:
            return _count_ok(lookup)
        return any(_count_ok(tag) for tag in (alias2tags.get(lookup) or ()))

    max_n = min(max(1, int(max_ngram)), len(tokens))
    used: set[int] = set()
    seen: set[str] = set()
    selected: list[tuple[int, str]] = []

    # Visit longer n-grams first so they claim their tokens before any
    # shorter n-gram gets a chance to match inside them.
    for n in range(max_n, 0, -1):
        for start in range(len(tokens) - n + 1):
            span = range(start, start + n)
            if not used.isdisjoint(span):
                continue
            lookup = "_".join(tokens[start:start + n])
            if not _resolves(lookup):
                continue
            # Always consume the span, even for a repeated phrase; otherwise
            # the repeat's component tokens would leak out as shorter matches.
            used.update(span)
            if lookup not in seen:
                seen.add(lookup)
                selected.append((start, lookup))

    selected.sort(key=lambda row: row[0])
    return [lookup for _, lookup in selected]
+if __name__ == "__main__":
+    print("preproc.py imports ok")

scripts/test_exact_tag_query_phrases.py ADDED Viewed

	@@ -0,0 +1,104 @@

+from pathlib import Path
+import sys
+repo_root = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(repo_root))
+from psq_rag.pipeline.preproc import extract_exact_tag_query_phrases
def assert_equal(actual, expected, message):
    """Fail with ``message`` when ``actual`` differs from ``expected``.

    Raises:
        AssertionError: if ``actual != expected``; the text includes the repr
            of both values.
    """
    differs = actual != expected
    if differs:
        raise AssertionError(f"{message}: expected {expected!r}, got {actual!r}")
def assert_in(item, values, message):
    """Fail with ``message`` when ``item`` is not a member of ``values``.

    Raises:
        AssertionError: if ``item not in values``; the text includes the repr
            of the item and of the container.
    """
    present = item in values
    if not present:
        raise AssertionError(f"{message}: {item!r} not in {values!r}")
def test_longest_match_suppresses_component_unigrams():
    """A matched 2-gram must hide the unigrams it is built from."""
    counts = {"red": 1000, "fox": 1000, "red_fox": 300, "burrito": 164}
    no_aliases = {}
    result = extract_exact_tag_query_phrases(
        "A red fox eating a giant burrito",
        counts,
        no_aliases,
        min_tag_count=100,
        max_ngram=2,
    )
    # "red" and "fox" are valid tags on their own but are covered by "red_fox".
    assert_equal(result, ["red_fox", "burrito"], "2-gram should suppress its component 1-grams")
def test_alias_resolution_uses_target_count_floor():
    """An alias is emitted only if one of its target tags clears the floor."""
    counts = {"hotdog": 150, "low_count_tag": 99}
    aliases = {"hot_dog": ["hotdog"], "rare_alias": ["low_count_tag"]}
    result = extract_exact_tag_query_phrases(
        "A hot dog and rare alias",
        counts,
        aliases,
        min_tag_count=100,
        max_ngram=2,
    )
    # "rare_alias" only resolves to a tag with count 99, below the floor of 100.
    assert_equal(result, ["hot_dog"], "alias phrase should emit only when a target clears min count")
def test_caption_prefix_is_ignored():
    """The ``caption_to_tags:`` task prefix must not produce query phrases."""
    # "caption" is a valid tag here, so it would match if the prefix leaked in.
    counts = {"caption": 1000, "red_fox": 300}
    result = extract_exact_tag_query_phrases(
        "caption_to_tags: red fox",
        counts,
        {},
        min_tag_count=100,
        max_ngram=2,
    )
    assert_equal(result, ["red_fox"], "task prefix should not contribute tag query phrases")
def test_real_assets_find_burrito_and_retrieve_it():
    """End-to-end check: extract phrases with the real tag data, then retrieve."""
    # Imported lazily so the pure-unit tests above do not need these modules.
    from psq_rag.retrieval.psq_retrieval import psq_candidates_from_rewrite_phrases
    from psq_rag.retrieval.state import get_alias2tags, get_tag_counts

    counts = get_tag_counts()
    extracted = extract_exact_tag_query_phrases(
        "A red fox eating a giant burrito",
        counts,
        get_alias2tags(),
        min_tag_count=100,
        max_ngram=2,
    )
    assert_in("red_fox", extracted, "real asset extraction should find red_fox")
    assert_in("burrito", extracted, "real asset extraction should find burrito")

    retrieved = psq_candidates_from_rewrite_phrases(
        rewrite_phrases=extracted,
        allow_nsfw_tags=False,
        min_tag_count=100,
        per_phrase_k=10,
        per_phrase_final_k=1,
        global_k=300,
    )
    retrieved_tags = {entry.tag for entry in retrieved}
    assert_in("burrito", retrieved_tags, "exact burrito query phrase should retrieve burrito")
def main():
    """Run every check in order and print a PASS line if none of them raised."""
    checks = (
        test_longest_match_suppresses_component_unigrams,
        test_alias_resolution_uses_target_count_floor,
        test_caption_prefix_is_ignored,
        test_real_assets_find_burrito_and_retrieve_it,
    )
    for check in checks:
        check()
    print("exact tag query phrase tests: PASS")
+if __name__ == "__main__":
+    main()