Spaces:

FoodDesert
/

Prompt_Squirrel_RAG

Running

App Files Files Community

Food Desert committed on Mar 7

Commit

c6be992

1 Parent(s): 4fdda86

Add alias-based character tag filtering for Stage 3

Browse files

Files changed (40) hide show

.gitignore +12 -0
.vscode/launch.json +13 -0
.vscode/settings.json +6 -0
AGENTS.md +20 -0
ConvertSampleImagesToJpeg.ipynb +147 -0
Prompt_Squirrel_RAG.code-workspace +8 -0
README.md +14 -14
SamplePrompts.csv +10 -0
TagDocumentation.txt +319 -0
app.py +293 -0
data/eval_samples/e621_sfw_sample_1000_seed123_buffer10000.jsonl +0 -0
docs/retrieval_contract.md +197 -0
docs/rewrite_contract.md +141 -0
docs/stage3_contract.md +170 -0
e621naturallanguagedataset.txt +140 -0
fluffyrock_3m.csv +0 -0
mascotimages/transparentsquirrel.png +3 -0
predict_all_tags_from_dump.ipynb +721 -0
psq_rag/__init__.py +0 -0
psq_rag/llm/__init__.py +0 -0
psq_rag/llm/openrouter_client.py +121 -0
psq_rag/llm/rewrite.py +67 -0
psq_rag/llm/select.py +711 -0
psq_rag/parsing/__init__.py +0 -0
psq_rag/parsing/prompt_grammar.py +60 -0
psq_rag/pipeline/__init__.py +0 -0
psq_rag/pipeline/preproc.py +36 -0
psq_rag/retrieval/__init__.py +0 -0
psq_rag/retrieval/psq_retrieval.py +500 -0
psq_rag/retrieval/state.py +398 -0
requirements.txt +13 -0
scripts/extract_tag_patterns.py +272 -0
scripts/rewrite_playground.py +130 -0
scripts/sample_dataset_streaming.py +68 -0
scripts/smoke_test.py +159 -0
scripts/stage3_debug.py +359 -0
scripts/test_alias_filter.py +304 -0
transparentsquirrel.png +3 -0
wiki_pages-2023-08-08.csv +3 -0
word_rating_probabilities.csv +0 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,12 @@

+.venv/
+__pycache__/
+*.pyc
+*.log
+*.tmp
+.DS_Store
+.env
+zout.txt
+tf_idf_files_420.joblib
+e621FastTextModel010Replacement_small.bin
+tfidf_hnsw_artists.bin
+tfidf_hnsw_tags.bin

.vscode/launch.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "version": "0.2.0",
+  "configurations": [
+    {
+      "name": "Run app.py",
+      "type": "python",
+      "request": "launch",
+      "program": "${workspaceFolder}/app.py",
+      "console": "integratedTerminal",
+      "envFile": "${workspaceFolder}/.env"
+    }
+  ]
+}

.vscode/settings.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "python.defaultInterpreterPath": ".venv/Scripts/python.exe",
+  "python.analysis.typeCheckingMode": "basic",
+  "python.analysis.autoImportCompletions": true,
+  "python.analysis.extraPaths": ["."]
+}

AGENTS.md ADDED Viewed

	@@ -0,0 +1,20 @@

+# Codex Instructions (Prompt_Squirrel_RAG)
+## Environment (Windows / PowerShell)
+- Always run from the repo root.
+- Never run `python` or `pip` directly.
+- Always use the venv interpreter:
+  - `.venv\Scripts\python.exe`
+- Install deps with:
+  - `.venv\Scripts\python.exe -m pip install -r requirements.txt`
+## Change discipline
+- Keep diffs small: fix one issue or implement one focused step per patch.
+- Do not rewrite large files.
+- Do not move logic across modules unless the contract requires it.
+- Preserve stage boundaries: rewriting (LLM) vs retrieval (candidate generation) vs selection (index-only).
+## Project contracts
+- Follow the retrieval grounding / candidate generation contract:
+  - `docs/retrieval_contract.md`
+- If behavior conflicts with existing code, update code to match the contract (not the other way around).

ConvertSampleImagesToJpeg.ipynb ADDED Viewed

	@@ -0,0 +1,147 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "4aa04654",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "098e115f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import glob\n",
+    "import os\n",
+    "import json\n",
+    "from PIL import Image\n",
+    "from sd_parsers import ParserManager\n",
+    "\n",
+    "# Directory with PNG images\n",
+    "image_directory = 'E:/image/holder/Tagset_Completer/sampleimages/02landscape'\n",
+    "\n",
+    "# Initialize the ParserManager\n",
+    "parser_manager = ParserManager()\n",
+    "\n",
+    "# Dictionary for artist names to corresponding JPG file names\n",
+    "artist_to_file_map = {}\n",
+    "\n",
+    "# Iterate through PNG files in the directory\n",
+    "for png_file in glob.glob(os.path.join(image_directory, '*.png')):\n",
+    "    with Image.open(png_file) as img:\n",
+    "        # Extract metadata using ParserManager\n",
+    "        prompt_info = parser_manager.parse(img)\n",
+    "        if prompt_info and prompt_info.prompts:\n",
+    "            first_prompt_text = list(prompt_info.prompts)[0].value.split(',')[0].strip()\n",
+    "            if first_prompt_text.startswith(\"by \"):\n",
+    "                first_prompt_text = first_prompt_text[3:]  # Remove \"by \" prefix\n",
+    "            artist_to_file_map[first_prompt_text] = os.path.basename(png_file).replace('.png', '.jpg')\n",
+    "        else:\n",
+    "            artist_to_file_map[\"\"] = os.path.basename(png_file).replace('.png', '.jpg')\n",
+    "\n",
+    "# Save the mapping to a JSON file in the same directory\n",
+    "json_path = os.path.join(image_directory, 'artist_to_file_map.json')\n",
+    "with open(json_path, 'w') as json_file:\n",
+    "    json.dump(artist_to_file_map, json_file, indent=4)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "ac5cba7f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Iterate through PNG files in the directory\n",
+    "for png_file in glob.glob(os.path.join(image_directory, '*.png')):\n",
+    "    # Open the image\n",
+    "    with Image.open(png_file) as img:\n",
+    "        # Convert the image to RGB mode in case it's RGBA or P mode\n",
+    "        img = img.convert('RGB')\n",
+    "        # Define the output filename replacing .png with .jpg\n",
+    "        jpg_file = png_file.rsplit('.', 1)[0] + '.jpg'\n",
+    "        # Save the image in JPG format\n",
+    "        img.save(jpg_file, 'JPEG')\n",
+    "        # Optionally, remove the original PNG file\n",
+    "        os.remove(png_file)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "32bfb9cc",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3648a9fc",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "09f74cbd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "d2e18c17",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "354fda37",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ac4e5911",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

Prompt_Squirrel_RAG.code-workspace ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+	"folders": [
+		{
+			"path": "."
+		}
+	],
+	"settings": {}
+}

README.md CHANGED Viewed

@@ -1,14 +1,14 @@
----
-title: Prompt Squirrel RAG
-emoji: 📚
-colorFrom: pink
-colorTo: indigo
-sdk: gradio
-sdk_version: 6.5.1
-app_file: app.py
-pinned: false
-license: mit
-short_description: RAG interface for Prompt Squirrel
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+---
+title: Prompt Squirrel
+emoji: 🐿️
+colorFrom: gray
+colorTo: gray
+sdk: gradio
+sdk_version: 5.43.1
+python_version: 3.10.12
+app_file: app.py
+pinned: false
+license: apache-2.0
+tags:
+- not-for-all-audiences
+---

SamplePrompts.csv ADDED Viewed

	@@ -0,0 +1,10 @@

+Prompts:,,,,
+name,source,description,prompt,negative
+soyjak,drhead,simple prompt shows styles more intensely,"by artist, soyjak, anthro, male, bust portrait, meme, grin",
+landscape,https://e621.net/posts/320878,tags from a landscape featuring no characters,"by artist, amazing background, cliff, cloud, crystal, detailed background, fantasy, forest, grass, high-angle view, horizon, landscape, monument, mountain, nature, not furry, outside, plant, plateau, river, rock, scenery, scenery porn, sculpture, sky, spikes, statue, tower, tree, water, waterfall, wood, zero pictured",nsfw
+goat,https://e621.net/posts/2741820,tags from a high-scoring image featuring a male furry,"by artist, bovid, caprine, goat, mammal, angry, anthro, bar emanata, bell, bell collar, blush, child, collar, cowbell, daww, emanata, fur, hair, horizontal pupils, horn, male, nude, open mouth, orange eyes, pupils, red collar, simple background, solo, square pupils, tongue, unusual pupils, white body, white fur, white hair, young, young anthro",nsfw
+,,,,
+,,,,
+,,,,
+Artists,,,,
+{by grypwolf|by evilymasterful|by domovoi lazaroth|by krazyelf|by bzeh|by harmarist|by doxy|by cervina7 \(artist\)|by nastycalamari|by tyroo|by secretly saucy|by siroc|by jrjresq|by stylusknight|by raccoondouglas|by furlana|by slimefur|by aycee|by ncs|by areye \(artist\)|by devo87|by youjomodoki|by qupostuv35|by seraziel|by juiceps|by dezz|by sligarthetiger|by scafen \(artist\)|by brolaren|by ro|by 0r0ch1|by zeta-haru|by glacierclear|by kluclew|by feretta|by the gentle giant|by pata|by raikissu|by f-r95|by wolfy-nail|by darkenstardragon|by tokifuji|by flamespitter|by twinkle-sez|by aennor|by dangpa|by twistedscarlett60|by neelix|by scruffythedeer|by frenky hw|by hladilnik|by quotefox|by w4g4|by ancesra|by tzarvolver|by wolflong|by katahane3|by saurian \(artist\)|by ittybittykittytittys|by km-15|by nawka|by utopianvee|by anchee|None|by darkgem|by joaoppereiraus|by kittydee|by monkeyspirit|by tailzkim|by sidnithefox|by killioma|by cyancapsule|by asnnonaka|by skidoo|by iwbitu|by shadman|by luccatoasty|by re-sublimity-kun|by hyilpi|by sepulte|by cumbread|by sususuigi|by r3drunner|by jailbird|by agitype01|by chikaretsu|by lonbluewolf|by rick griffin|by euyoshi89|by cold-blooded-twilight|by domasarts|by katarhein|by fivel|by nextel|by negger|by mcfli|by gekasso|by anglo|by securipun|by zeiro|by cocoline \(artist\)|by lizardlars|by sabrotiger|by dripponi|by krokobyaka|by type|by bastionshadowpaw|by amberpendant|by chromapan|by buta99|by demicoeur|by alfa995|by spuydjeks|by spirale|by shaolin bones|by seth-iova|by complextree|by freckles \(artist\)|by angiewolf|by glopossum|by aoizuri|by inuzu|by zourik|by manmosu marimo|by sijimmy456|by zummeng|by mleonheart|by macaronneko|by pache riggs|by kanashiipanda|by smileeeeeee|by sicklyhypnos|by diacordst|by haychel|by zawmg|by orionsmaniac \(artist\)|by vhkansfweer|by tsampikos|by johnfoxart|by zp92|by gammainks|by gerrkk|by aomori|by kionant|by kanel|by tattoorexy|by mcfan|by sepiruth|by clockhands|by carpetwurm|by 
capaoculta|by miles df|by sana!rpg|by carrot \(artist\)|by inno-sjoa|by raptoral|by thericegoat|by iriedono|by acstlu|by rov|by glitter trap boy|by redrusker|by ldr|by frumples|by nikraccoom|by mystikfox61|by haaru|by ketei|by somik|by zinfyu|by jinu|by zoyler|by rotten robbie|by nurinaki|by sincrescent|by bonnie bovine|by cooliehigh|by s1m|by dash ravo|by jakethegoat|by claweddrip|by 007delta|by jizoku|by personalami|by marblesoda|by dagasi|by chrysalisdraws|by marik azemus34|by nnecgrau|by atrolux|by slugbox|by imgonnaloveyou|by snowskau|by drmax|by lazysnout|by xennos|by oro97|by dark violet|by eternity-zinogre|by nepentz|by rysonanthrodog|by sigma x|by omega56|by letodoesart|by skully|by delki|by ratatooey|by codyblue-731|by honeycalamari|by saltyxodium|by fleet-foot|by ashraely|by cobaltsynapse|by edjit|by twang|by etheross|by chelodoy|by shinodage|by dlw|by twiren|by ssssnowy|by nikkibunn|by backsash|by syuro|by zaush|by skeleion|by chunie|by butterchalk|by loimu|by seibear|by r-mk|by cobalt snow|by braeburned|by eldiman|by einshelm|by trigaroo|by eto ya|by gewitter|by wizzikt|by hyattlen|by coffeesoda|by photonoko|by woolrool|by jarnqk|by nuzzo|by inu-sama|by ruaidri|by jishinu|by merrunz|by hioshiru|by thousandfoldfeathers|by desertkaiju|by kakhao|by xeono|by b-epon|by nexivian|by smiju|by captainzepto|by meesh|by catcouch|by sorc|by ajin|by rajii|by tofu froth|by sagaris uwu|by burgerkiss|by black-kitten|by kawfee|by lizet|by berseepon09|by sssonic2|by backlash91|by doomthewolf|by arbuzbudesh|by k 98|by picturd|by rayka|by soulcentinel|by adelaherz|by babywife|by stargazer|by elicitie|by rakisha|by kuroodod|by discordthege|by the-minuscule-task|by rainbowscreen|by skygracer|by lynncore|by itsunknownanon|by goonie-san|by kekitopu|by ultrabondagefairy|by mawmain|by hoodie \(artist\)|by truegrave9|by modca|by stoopix|by fumiko|by patto|by iskra|by the crab mage|by narse|by zero-sum|by digitoxici|by abesdrawings|by yuio|by zhanbow|by avante92|by hinar miler|by 
kikurage|by raaz|by romarom|by iztli|by unknown artist|by foxovh|by dimwitdog|by miso souperstar|by totesfleisch8|by keadonger|by piporete|by valkoinen|by jay-r|by thesecretcave|by smitty g|by pixelsketcher|by youwannaslap|by seff|by sicmop|by dragonfu|by magnetus|by chloe-dog|by alibi-cami|by bonifasko|by dankflank|by pakwan008|by deymos|by viejillox|by lysergide|by metal \(artist\)|by vader-san|by lockworkorange|by prsmrti|by halbean|by naive tabby|by shoutingisfun|by kiyosan|by daftpatriot|by gothbunnyboy|by anonymous artist|by hark|by phenyanyanya|by tsudamaku|by koorinezumi|by natoli|by jackaloo|by boo3|by tfancred|by nana gel|by reddragonkan|by flinters|by amegared|by markie|by nishi oxnard|by chrisandcompany|by triadfox|by dlrowdog|by hentai boy|by lizheru|by buzzer \(artist\)|by satsumalord|by pasaran|by foxfoxplz|by blpanda|by babystar|by yantaro keno|by renee-moonveil|by 9x9|by tombola1993|by raptor007|by chaostone|by cooner|by mt tg|by ficficponyfic|by sarcolopter|by azumaril|by dreadwolfclaw1990|by bigshow|by fierglief|by bobert|by zeriara|by mac-daddy|by dragmon|by jbond|by trevor-fox|by parclytaxel|by kusosensei|by gyrotech|by itoruna|by a.b. 
lust|by superbunnygt|by doneru|by box xod|by lefthighkick|by uniparasite|by malicekira|by mizzyam|by vrabo|by sacrificabominat|by zer0rebel4|by rikitoka|by karabiner|by fredryk phox|by mot|by rairai-no26-chu|by citrinelle|by jrvanesbroek|by makarimorph|by torakuta|by 1boshi|by skyelegs|by kanada|by darkdoomer|by smudge proof|by riorix|by kitchiki|by bristol|by fuze|by dirtyscoundrel|by foxball|by badumsquish|by ken sugimori|by lovelesskiax|by ricky hoffman|by buta5kawa|by roobin|by grumpy griffin creations|by mastergodai|by imperatorcaesar|by lagotrope|by ichthy0stega|by dark-moltres|by smutbooru|by deanwolfwood|by kamui shirow|by koraru-san|by foxenawolf|by caramelcraze|by date natsuku|by cotton \(artist\)|by catmonkshiro|by julius zimmerman|by hitec|by snow utamaru|by ottahz|by ryuko rose|by takagi kyou|by ka-samy|by ittybittyshark|by dynoex|by hatake|by kraken \(artist\)|by ruthredmane|by cybercat|by honesty \(artist\)|by freeze-pop88|by kinoshita-jiroh|by sobieniak|by viroveteruscy|by kelly hamilton|by pembrokewkorgi|by hinami|by kick \(artist\)|by train \(artist\)|by mind drive|by ayaka|by harpseal|by ukisudori|by inunoshippo|by sikai|by jamminbison|by artsy-theo|by marco fanjul|by wolfmalro|by positive wishes \(artist\)|by schwartzgeist|by utsuki maito|by bunnie love|by mulefoot|by chris goodwin|by poge jirushi|by thegreatmatsutzu|by sachiel 666|by inkyfrog|by dtalvi|by rorr|by fab3716|by rex equinox|by navitaserussirus|by rousemouse|by bitterplaguerat|by dannyg|by sbshouseofpancakes|by slb|by edgar rice burroughs|by doug winger|by maxime-jeanne|by rocket grunt \(artist\)|by usuario2 \(artist\)|by mauroz|by sailoranna|by tatwuyan|by tkc2021|by misterdonn|by tanutronik753 k|by namagakiokami|by emufu|by suishou0602|by macop|by bakukurara|by oogamikennta|by tigerlilylucky|by mike sherman|by snowfyre|by mylafox|by kitfox-crimson|by arania|by selinatc|by toshi \(artist\)|by mofuaki|by pokefound|by delirost|by galacticmichi|by doost|by trixythespiderfox|by 
darkmirage|by aogami|by meraence|by isolatedartest|by nottrevbe|by nsfwzhenya|by fourball|by manene|by trinity-fate62|by kilinah|by ingi|by latchk3y|by pochincoff|by welost|by skipsy|by bunnybits|by lunalei|by yousan|by kaynine|by honovy|by dream and nightmare|by wugi|by viskasunya|by faejunkie|by v-tal|by sabuky|by faeki|by kammi-lu|by foxes in love|by nightfaux|by virtyalfobo|by peculiart|by rika|by marsminer|by discreet user|by marshmallow-ears|by aeonspassed|by dreiker|by lyme-slyme|by punkypanda|by ponporio \(artist\)|by sonsasu|by kame 3|by pururing|by wbnsfwfactory|by bikupan|by bigdon1992|by lichfang|by bakemonoy|by b-ern|by merunyaa|by redishdragie|by lightsource|by enigi09|by hanuvo|by justmegabenewell|by thefuckingdevil|by minnosimmins|by qwertydragon|by fakeryway|by cotora|by ark warrior|by danomil|by avoid posting|by kostos art|by ratcha|by atryl|by fuf|by lvlirror|by theboogie|by nitani|by roly|by aer0 zer0|by hardyboy|by nozomyarts|by sinsquared|by cherrikissu|by asaneman|by tfzn|by hooves-art|by catsudon|by bigcozyorca|by mr.smile|by sinensian|by nukochi|by felino|by toto draw|by mytigertail|by arrwulf|by oselotti|by gorsha pendragon|by laser \(artist\)|by doesnotexist|by nekowuwu|by alanscampos|by el-loko|by compfive|by komdog|by magenta7|by milachu92|by serex|by bigdad|by aaron \(artist\)|by diadorin|by pig \(artist\)|by slickerwolf|by angstrom|by kihu|by ike marshall|by chalo|by furball \(artist\)|by lavenderpandy|by hunterramirez|by kloudmutt|by jerseydevil|by zi ran|by moreuselesssource|by ocaritna|by rukifox|by tggeko|by kiseff|by e254e|by princelykaden|by artdecade|by inuki|by prrrrrrmine|by chewycuticle|by haps|by senz|by argento|by daigaijin|by falcrus|by omari|by risenpaw|by satsukii|by lollipopcon|by ralek|by kyrosh|by tush|by reccand|by sindoll|by zerofox1000|by kaboozey|by somescrub|by yurusa|by limebreaker|by keffotin|by matemi|by uromatsu|by roadiesky|by saku1saya|by knightmoonlight98|by zerolativity|by winick-lim|by harnny|by 
girlsay|by sukebepanda|by sparrow \(artist\)|by amazinggwen|by slug \(artist\)|by smoothlabs|by eleacat|by replica \(artist\)|by thewill|by kevinsano|by feliscede|by james howard|by moki|by skylardoodles|by hyucaze|by lumineko|by conditional dnp},,,,

TagDocumentation.txt ADDED Viewed

	@@ -0,0 +1,319 @@

+tag what you see (locked)
+Regardless of what you know from outside sources, only tag what you can see in the image.
+Also, make sure you check out e621:Tag What You See (Explained) for the reasoning behind the TWYS policy.
+Unlike many other art sites, e621 has a tagging policy called "Tag What You See", or TWYS for short.
+TWYS states that all General category tags on a post must be directly evident from within the post itself. TWYS applies only to visual elements within a post, such as objects, characters, and the actions taken by characters that are visible. Audio content is not tagged, except in the Meta category.
+For example, a solo picture of a character who appears male must be tagged male.
+That remains true even if the artist or the character owner themselves state that the character is not male, or if text within the image states that the character is not male. These tags refer strictly to a character's outward appearance and nothing more.
+This policy exists to make search results more predictable and objective.
+Note that you can use Lore tags to describe the stated genders of the characters, rather than the visible ones.
+Tags in other categories are not entirely subject to the Tag What You See principle:
+    Tags in the Lore category are meant to convey the artist's intentions or other background information that cannot be reliably determined via TWYS, such as gender identity or familial relations.
+    Tags in the Character and Species categories are partially dependent upon TWYS: that is, external information can be used to help identify what character or species is supposed to be depicted in the post in cases where it isn't obvious, but it cannot actively conflict with what is seen in the post. For example, you can tag character a if the artist claims that a disembodied hand in the post belongs to character a, unless the hand looks nothing like character a and instead looks exactly like it belongs to character b. In that case, TWYS overrides the artist's word.
+    Tags in other categories are valid if the information that they convey is objectively true, such as the artist's name, the image's aspect ratio, or the IP holder of the characters in the post. For MP4, WebM, and Flash posts, audio-related tags may be included in the Meta category, but only to the extent of describing the presence and type of audio in the post (see the sound article for more information).
+There will be times when it's still not clear what tags should be applied to an image. An administrator should be contacted to help resolve such cases.
+Leeway may be given to hybrid characters, as the species of which they are composed are not always obvious.
+Note: tag_what_you_see is not a tag to be used. If a post contains this tag, please remove it.
+See also
+    Help with tags
+    How to tag genders
+    Overly Specific
+    Tag What You See (Explained)
+    Tagging Checklist
+Posts (view all)
+Nobody here but us chickens!
+###
+ e621:tag what you see (explained) (locked)
+[Back: e621:index]
+The text below is intended to be a sort of "introduction" to e621's Tag What You See policy. The text below is NOT the policy itself, which you can view here: Tag What You See
+Reading and understanding the TWYS policy is extremely important if you intend on editing tags on posts at all, so please make sure you read the policy itself as well as this introduction.
+The Policy
+A brief summary of what the TWYS policy is:
+Unlike many other art sites, e621.net has a tagging policy called "Tag What You See" (aka: "TWYS"). With very few exceptions, TWYS says that all tags on a post must be directly verifiable within the post itself. Example: a solo picture of what APPEARS to be a male character will be tagged "male". Even if the character was defined as "female" on other sites by the artist or character owner themselves, the picture would still need to be tagged "male" on e621, because of the TWYS policy.
+This may seem unusual and even insensitive, but please read on to understand why the site functions this way.
+The Debate
+The dispute between "Tag What You See" and "Tag What You Know"
+The Reasons
+There are several reasons for the necessity of the TWYS policy.
+The Problems
+Of course, no method of tagging is perfect, and there are a few problems that tend to arise as a result of using TWYS:
+    Sometimes users are just going to disagree over what is "seen" in a post or not. This is simply an expected consequence of having a TWYS policy. These situations will often need intervention from an administrator in order to resolve.
+    Gender tags (male, female, herm, etc) are typically at the heart of most TWYS debates. The reasons for this are numerous, but it boils down to A) artists drawing characters in ways that make it difficult to determine gender, and B) characters designed in such a way that they can easily appear to be either one gender or another (e.g. a herm wearing clothes typically looks just female). Again, there's nothing "wrong" with doing this, but it undoubtedly leads to confusion and people getting the wrong ideas if the artwork is ever viewed by itself. Again, e621 currently is interested only in a character's APPARENT gender, not their DEFINED gender. But sometimes even the apparent gender isn't obvious; in these cases, an administrator will need to make the final decision.
+Tip:
+To quickly link other e621 users to this page, simply type [[twys]] in your message. Example: "Check out [[twys]] for an explanation of the TWYS rule."
+[Back: e621:index]
+###
+ e621:tagging checklist (locked)
+[Back: e621:index]
+This is an informal and unofficial supplement to the tagging rules and guidelines, meant to encourage better and more complete tagging.
+Make sure you're also familiar with our Tag What You See policy before editing tags: tag_what_you_see for the policy itself, and e621:Tag What You See (Explained) for a more in-depth explanation why we use TWYS.
+Each entry below poses a general question about a post, with some example tags that answer it. A good post will probably have most of these answered (but not necessarily all).
+Basics
+Tags that all posts should have, to maintain minimal searchability.
+    Artist(s)? Use their best known alias. If a picture has more than one artist, tag them all, along with collaboration. If you're not sure who the artist is, tag unknown_artist. If the artist wishes to remain anonymous, use anonymous_artist instead.
+    Rating?
+        Explicit for fully or partially exposed genitalia (penis, pussy, cloaca, sheath, balls, or anus), various sex acts even if no genitalia are visible, high amounts of violence/gore, sexual fluids such as cum or pussy_juice, and extreme sexual fetishes such as scat, watersports, or BDSM.
+        Safe for anything that can be viewed in public without much uproar: no genitals, no sexual overtones or poses, no realistic violence, or any questionable activity.
+        Questionable for everything in between, such as topless females and suggestive poses.
+            For more help on ratings please see e621: Ratings
+    Copyright? The original series or company a character or game is owned by.
+    Character? Tag the character's best known name. If not that, their full name.
+    Body type? anthro, feral, humanoid, taur, anthrofied (pokemorph, digimorph), ponified, feralized
+    Species? human, canine, feline, bovine, cervine, equine, lagomorph, rodent, avian, insect, marine (cetacean, shark), scalie (click for detailed lists)
+    Sex/gender? male, female, intersex (herm, maleherm, gynomorph, andromorph), ambiguous_gender
+        See How To: Tag Genders for a detailed guide
+    How many? solo, duo, trio, group, zero_pictured
+    Clothing? fully_clothed, partially_clothed, skimpy, nude, bottomless, topless, underwear, open_shirt
+    Location? inside, outside, bedroom, kitchen, forest
+    Perspective? front_view, rear_view, side_view, three-quarter_view, low-angle_view, high-angle_view, worm's-eye_view, bird's-eye_view, first_person_view
+Sexually explicit
+    Male bits? penis, balls, sheath, knot, erection, half-erect, flaccid, humanoid_penis, equine_penis, tapering_penis, veiny_penis, uncut, circumcised
+    Female bits? pussy, clitoris, plump_labia, equine_pussy, canine_pussy
+    Other? butt, anus, puffy_anus, gaping_anus, urethra, genital_slit
+    Sex act? sex (male/female, female/female, male/male, bisexual), masturbation, handjob, footjob, fellatio, cunnilingus, vaginal_penetration, anal_penetration, threesome, foursome, orgy, gangbang, frottage, tribadism, orgasm, cum_inside
+    Position? Common ones: missionary_position, cowgirl_position, reverse_cowgirl_position, from_behind, 69_position, stand_and_carry_position.
+        See also: tag group:sex positions
+    Sexual themes? bondage, domination, rape, rough_sex, happy_sex, presenting, internal, impregnation, bestiality, interspecies, public, exhibitionism
+    Fluids? cum, cumshot, precum, pussy_juice, pussy_ejaculation, saliva
+    Toys? dildo, vibrator, buttplug, egg_vibrator, strapon, feeldoe
+Pose / Activity / Appearance
+    General activity (if any)? walking, running, fighting, sleeping, dancing, eating, kissing, licking
+    Posture? standing, bent_over, sitting, crouching, kneeling, all_fours, on_front, on_side, on_back, ass_up (see tag group:pose for full list)
+    Body decor? glasses, ring, necklace, bracelet, anklet, tattoo, piercing, collar, hat
+    Fur style? mane, chest_tuft, pubes
+    Hair? hair, long_hair, short_hair
+    Breasts? breasts (small_breasts, big_breasts, huge_breasts), nipples, under_boob, side_boob, teats
+    Limbs? crossed_arms, raised_arms, arms_behind_head, spread_legs, crossed_legs, raised_leg, legs_up, raised_tail, tailwag
+    Gaze? looking_at_viewer, looking_back, eye_contact, eyes_closed
+    Expression? blush, wink, smile, grin, tongue_out, naughty_face, embarrassed, happy, sad
+Information and Requests
+    Quality/medium? sketch, line_art, monochrome, shaded, pencil_(artwork), watercolor, 3D, digital_media_(artwork)
+    Picture organization? comic, multiple_scenes, sequence, close-up, portrait, pinup, solo_focus, wallpaper
+    Style? toony, detailed, realistic
+    Text and languages? english_text, japanese_text, spanish_text, runes, dialogue, speech_bubble, symbol
+    Information? translated, partially_translated, unknown_artist_signature, not_furry, bigger_version_at_the_source
+    Requests? translation_request, source_request, tagme
+    Image size? low_res, hi_res, absurd_res, superabsurd_res
+    Year of creation? 2016, 2015, and so on
+Heavily vetted tags.
+Tags that can be found on our global blacklist, and heavily vetted tags MUST be added upon upload.
+    young, gore, scat, watersports, diaper, my little pony, vore, not furry, rape, hyper, feral, nazi, politics, zoophile iconography.
+    Everything pedophilia
+Do NOT tag
+    Subjective tags that express opinions. Common examples include beautiful, sexy, hot, good, crappy and most other adjectives. Subjective themes can be collected into a set instead. (See https://e621.net/help/sets )
+    Generic tags such as legs, eyes, big, image and organism.
+###
+Help: Tags
+← E621 Wiki – Tags
+Table of Contents
+    Guidelines
+    Categories
+        Artist
+        Contributor
+        Character
+        Copyright
+        Species
+        General
+        Meta
+        Lore
+        Invalid
+    Changing Tag Category
+Read More: Aliases | Implications | Bulk Update Requests
+      Search Cheatsheet
+Tags
+Tags are keywords that you can use to describe posts.
+They serve a dual purpose: they allow you to both find the content that you like, and to filter out stuff that you dislike.
+Tags may belong to various categories, and may interact with each other via relationships.
+See the cheatsheet for examples of the search syntax.
+↑ Guidelines
+When tagging a post, you must follow the following guidelines.
+Tag What You See
+Full article: Tag What You See.
+Unlike many other art sites, e621 has a tagging policy called "Tag What You See", or TWYS for short.
+TWYS states that all General category tags on a post must be directly evident from within the post itself.
+For example, a solo picture of a character who appears male must be tagged male.
+That remains true even if the artist or the character owner themselves state that the character is not male, or if text within the image states that the character is not male. These tags refer strictly to a character's outward appearance and nothing more.
+This policy exists to make search results more predictable and objective.
+Note that you can use Lore tags to describe the stated genders of the characters, rather than the visible ones.
+Tags in other categories are not entirely subject to the Tag What You See principle:
+    Tags in the Lore category are meant to convey the artist's intentions or other background information that cannot be reliably determined via TWYS, such as gender identity or familial relations.
+    Tags in the Character and Species categories are partially dependent upon TWYS: that is, external information can be used to help identify what character or species is supposed to be depicted in the post in cases where it isn't obvious, but it cannot actively conflict with what is seen in the post. For example, you can tag character a if the artist claims that a disembodied hand in the post belongs to character a, unless the hand looks nothing like character a and instead looks exactly like it belongs to character b. In that case, TWYS overrides the artist's word.
+    Tags in other categories are valid if the information that they convey is objectively true, such as the artist's name, the name of a voice actor, the image's aspect ratio, or the IP holder of the characters in the post.
+Minimum tag requirements
+Code of Conduct 2.2 - Tagging, Rating, and Sourcing Abuse
+All posts are expected to have at least ten general, non-implied tags upon upload. This refers to tags in the General tag category: Artist, Character, Species, Copyright, Lore, Meta, and Invalid tags do not count towards this requirement. "Non-implied" means that a tag which is added by implication from another tag does not count. For example, forest implies tree which implies plant. If you add the forest tag, both tree and plant will be added automatically. However, only the first tag counts towards the minimum tag requirement.
+This restriction will be eased if the post does not have ten distinct tags that are reasonably applicable to it. For example, extremely simplistic posts such as some zero pictured images may not depict enough to create ten tags.
+Contentious or objectionable content must always be tagged upon upload. This includes any strange, unusual, or extreme fetishes depicted within the post.
+Forbidden characters
+Tags may only contain English letters, numbers, and some symbols.
+No unicode characters, or characters belonging to languages other than English, may be used.
+The following characters are reserved for potential future uses.
+No new tags containing them can be created.
+    %,#\\*: anywhere in the tag
+    -~: as the first character
+Note that some existing tags already contain such characters.
+These tags predate the rule change, and will likely be phased out at some point in the future.
+↑ Categories
+There are nine categories (or "types") of tags on e621. They help to organize the many tags listed on this site and its many, many posts.
+This page will provide a quick rundown of what they are for and how to change the categories of tags from one to another.
+artist
+Arguably the most important tag on any post is the one that identifies the person who made the post itself.
+This (usually) isn't the e621 member who uploaded the post, a person who edited the post, and certainly not anyone who merely commissioned or requested the post.
+Artist tags are essential, as we maintain and respect an Avoid Posting List.
+If you are unable to identify the artist, then unknown_artist should be used. If the artist does not want to be identified, then anonymous_artist should be used instead.
+There are a few non-artist tags that are deliberately typed as "artists" in order to bring attention to them.
+    avoid_posting and its variant conditional_dnp tags identify artists with DNP or conditional DNP status
+    epilepsy_warning is used for flashing lights in animated, Flash, and video posts that could trigger epileptic seizures
+    sound_warning is for any loud sound playing in Flash and video posts
+        jumpscare_warning is for posts featuring loud sounds (typically screams) accompanied by unsettling or scary visuals.
+    unknown_artist_signature is for posts where there is an artist's signature on it, but the artist who made it could not be immediately identified
+contributor
+People who did not create the specific artwork in the post but who did provide creative contributions that are considered significant and essential to the artwork itself. (See topic #54179 for the discussion thread about this new category.)
+Currently, only two types of contributors are recognized for this category.
+    Voice actors, whose tags are suffixed with the disambiguation _(va).
+    Character modelers, whose tags are suffixed with the disambiguation _(modeler).
+Note that the primary artist(s) of a post are not to be tagged as contributor; they are still tagged as artists as normal. If the artist is also the modeler, they are to be tagged as just an artist; modeler tags are to be used if they created or provided a character model but did not provide the composition of the post. Likewise, if the artist of a video post voice acted for their own video, they still don't get a separate contributor tag.
+character
+Any identifiable fictional or real world individual who can be seen in a post, even if they're not actually "there".
+A statue or a kigurumi modeled after a character, or the cover of a solo music artist's album, would still be tagged as their corresponding characters.
+Characters can range from mere fursonas to globally famous copyrighted characters like Mickey Mouse, Bugs Bunny, and Mario. Fan characters are also covered here.
+If you cannot identify a character, but you do know that they either are owned by someone or come from the real world, then unknown_character should be used.
+copyright
+Any recognizable brands and franchises (as well as the companies who own them) that can be identified through the use of their characters, settings, or other recognizable elements.
+Parodies of copyrights are also tagged with the copyrights that a post is parodying. Specific holidays like Christmas, Easter, and Halloween are also given copyright status.
+The real world is also a copyright tag, for what it's worth.
+species
+The bread and butter tags of this curated furry image archive, covering many real and fictional creatures.
+Cats, dogs, horses, fish, scalies, aliens, robots, spirits, Pocket Monsters, Digital Monsters, regular monsters, and the dreaded but mostly harmless humans are among the many kinds of creatures that you can find here.
+If you can't properly identify a species, then there are two tags you can use: unknown_species for creatures with identifying features, and ambiguous_species for creatures whose species cannot be determined at all.
+general
+These plain-colored tags are for anything else that doesn't fit into any of the aforementioned five categories. Genders, objects, distinguishing features, locations, fetishes, sexual positions, sexual acts, and so on.
+New tags are automatically categorized as general tags. Artists, contributors, characters, copyrights, and species that haven't been properly re-typed to such yet are most likely typed as general tags as well.
+meta
+Tags that describe facts about the image itself, rather than what's in it, are placed in the meta category.
+Some of these tags are added automatically, like hi_res. Others, like 16:9 or 1:1, are added by dedicated bots.
+Tags describing what year the image was made also belong in this category, from 2025 all the way back to 6th_century_bc.
+lore
+Unlike other categories, lore tags are entirely outside the realm of TWYS. Instead, lore tags provide information that is either incorrect when following TWYS, or simply cannot be confirmed visually in the image itself, yet still relevant to the post.
+Keep in mind that standard TWYS tags should still be used where applicable. Lore tags do not replace them.
+Whenever a submission must be tagged as something that is "wrong", a lore tag should be added to provide the correct information.
+The most common use for lore tags is to correct gender tags – for example, a post that is tagged gynomorph might also need a herm_(lore) tag if that's what the character is, despite there not being any evidence of that in the image itself.
+Conversely, some fetish tags (like incest) cannot always be definitively confirmed through the image itself, and thus belong in the lore category.
+New lore tags can be requested on the forums.
+invalid
+Some tags are too ambiguous or broad to be useful, so they are placed in the invalid category.
+They should be replaced with better-fitting or more specific tags.
+Please, do not simply remove invalid tags without fixing the issue.
+###

app.py ADDED Viewed

	@@ -0,0 +1,293 @@

+import gradio as gr
+import os
+import logging
+from PIL import Image
+from pathlib import Path
+from typing import List
+from psq_rag.pipeline.preproc import extract_user_provided_tags_upto_3_words
+from psq_rag.llm.rewrite import llm_rewrite_prompt
+from psq_rag.retrieval.psq_retrieval import psq_candidates_from_rewrite_phrases, _norm_tag_for_lookup
+from psq_rag.llm.select import llm_select_indices
+def _split_prompt_commas(s: str) -> List[str]:
+    return [p.strip() for p in (s or "").split(",") if p.strip()]
+def _norm_for_dedupe(tag: str) -> str:
+    # your canonical form for lookup/dedupe
+    return _norm_tag_for_lookup(tag.lower())
+def compose_final_prompt(rewritten_prompt: str, selected_tags: List[str]) -> str:
+    parts = _split_prompt_commas(rewritten_prompt)
+    parts.extend(selected_tags)
+    seen = set()
+    out = []
+    for p in parts:
+        key = _norm_for_dedupe(p)
+        if key in seen:
+            continue
+        seen.add(key)
+        out.append(p)
+    return ", ".join(out)
+# Set up logging
+# Minimal prod logging: warnings+ to stderr, no file by default
+# NOTE(review): os and logging are already imported at the top of this file,
+# so the re-import below is redundant (and not PEP 8 style).
+import os, logging
+# The level name comes from the PSQ_LOG_LEVEL environment variable (default "WARNING").
+LOG_LEVEL = os.environ.get("PSQ_LOG_LEVEL", "WARNING").upper()
+logging.basicConfig(
+    # An unrecognized level name falls back to logging.WARNING via getattr's default.
+    level=getattr(logging, LOG_LEVEL, logging.WARNING),
+    format="%(asctime)s %(levelname)s:%(message)s",
+    handlers=[logging.StreamHandler()]  # no file -> avoids huge logs on Spaces
+)
+# Quiet down common noisy libs (optional): only ERROR and above get through.
+for _name in ("gensim", "gradio", "hnswlib", "httpx", "uvicorn"):
+    logging.getLogger(_name).setLevel(logging.ERROR)
+# Turn off Gradio analytics phone-home to avoid those background thread errors (optional)
+os.environ["GRADIO_ANALYTICS_ENABLED"] = "0"
+# Mascot image location, resolved relative to the directory containing this file.
+MASCOT_DIR = Path(__file__).parent / "mascotimages"
+MASCOT_FILE = MASCOT_DIR / "transparentsquirrel.png"
+# Hotfix: wrap gradio_client's JSON-schema helpers so that non-dict schemas
+# (e.g. a bare True/False) are reported as "any" instead of being passed on.
+# The whole patch is best-effort: any failure is printed and startup continues.
+try:
+    from gradio_client import utils as _gc_utils
+    # Capture the original implementations before they are replaced below.
+    _orig_get_type = _gc_utils.get_type
+    _orig_j2p = _gc_utils._json_schema_to_python_type
+    # NOTE(review): _orig_pub is captured but never called; _pub_safe delegates
+    # to _j2p_safe instead of the original public function.
+    _orig_pub = _gc_utils.json_schema_to_python_type
+    def _get_type_safe(schema):
+        # Sometimes schema is a bare True/False (JSON Schema boolean form)
+        if not isinstance(schema, dict):
+            return "any"
+        return _orig_get_type(schema)
+    def _j2p_safe(schema, defs=None):
+        # Accept non-dict schemas (True/False/None) and treat as "any"
+        if not isinstance(schema, dict):
+            return "any"
+        # When no defs are supplied, fall back to the schema's own "$defs" entry.
+        return _orig_j2p(schema, defs or schema.get("$defs"))
+    def _pub_safe(schema):
+        # Public wrapper used by Gradio; keep it resilient too
+        if not isinstance(schema, dict):
+            return "any"
+        return _j2p_safe(schema, schema.get("$defs"))
+    # Install the wrappers in place of the originals.
+    _gc_utils.get_type = _get_type_safe
+    _gc_utils._json_schema_to_python_type = _j2p_safe
+    _gc_utils.json_schema_to_python_type = _pub_safe
+except Exception as e:
+    # Covers a failed import as well as missing attributes on gradio_client.utils.
+    print("gradio_client hotfix not applied:", e)
+# -------------------------------------------------------------------------------
+# Runtime switches read by rag_pipeline_ui.
+# Passed to retrieval as allow_nsfw_tags.
+allow_nsfw_tags = False
+# Passed to retrieval as verbose; also gates the per-phrase report logging.
+verbose_retrieval = True
+# When True, every candidate row of a phrase report is logged (no row limit).
+verbose_retrieval_all = False
+# Maximum rows logged per phrase when verbose_retrieval_all is False (minimum 1).
+verbose_retrieval_limit = 20
+# Stylesheet handed to gr.Blocks(css=...).
+css = """
+.scrollable-content{
+  max-height: 420px;
+  overflow-y: scroll;          /* always show scrollbar */
+  overflow-x: hidden;
+  padding-right: 8px;
+  padding-bottom: 14px;   /* <— add this */
+  scrollbar-gutter: stable;    /* prevent layout shift as it fills */
+  /* Firefox */
+  scrollbar-width: auto;
+  scrollbar-color: rgba(180,180,180,.9) rgba(0,0,0,.15);
+}
+/* WebKit/Chromium (Chrome/Edge/Safari) */
+.scrollable-content::-webkit-scrollbar{ width: 10px; }
+.scrollable-content::-webkit-scrollbar-thumb{ background: rgba(180,180,180,.9); border-radius: 8px; }
+.scrollable-content::-webkit-scrollbar-track{ background: rgba(0,0,0,.15); }
+/* (Optional) make both scroll panes taller so they fill more of the column */
+.pane-left  .scrollable-content,
+.pane-right .scrollable-content {
+  max-height: 610px;                /* was 420px; tweak to taste */
+}
+"""
+def rag_pipeline_ui(user_prompt: str):
+    logs = []
+    def log(s): logs.append(s)
+    try:
+        log("Start: received prompt")
+        prompt_in = (user_prompt or "").strip()
+        if not prompt_in:
+            return "Error: empty prompt", ""
+        log("Input:")
+        log(prompt_in)
+        log("")
+        user_tags = extract_user_provided_tags_upto_3_words(prompt_in)
+        log("Heuristically extracted user tags:")
+        if user_tags:
+            log(", ".join(user_tags))
+        else:
+            log("(none)")
+        log("")
+        log("Step 1: LLM rewrite")
+        rewritten = llm_rewrite_prompt(prompt_in, log)
+        log("Rewrite:")
+        log(rewritten if rewritten else "(empty)")
+        log("")
+        rewrite_for_retrieval = rewritten
+        if user_tags:
+            # keep them separate in logs, but allow them to help retrieval
+            rewrite_for_retrieval = (rewrite_for_retrieval + ", " + ", ".join(user_tags)).strip(", ").strip()
+        log("Step 2: Prompt Squirrel retrieval (hidden)")
+        try:
+            rewrite_phrases = [p.strip() for p in (rewrite_for_retrieval or "").split(",") if p.strip()]
+            retrieval_result = psq_candidates_from_rewrite_phrases(
+                rewrite_phrases=rewrite_phrases,
+                allow_nsfw_tags=allow_nsfw_tags,
+                global_k=300,
+                verbose=verbose_retrieval,
+            )
+            if isinstance(retrieval_result, tuple):
+                candidates, phrase_reports = retrieval_result
+            else:
+                candidates, phrase_reports = retrieval_result, []
+            log(f"Retrieved {len(candidates)} candidate tags")
+            if verbose_retrieval:
+                log(f"Total unique candidates: {len(candidates)}")
+                limit = None if verbose_retrieval_all else max(1, int(verbose_retrieval_limit))
+                for report in phrase_reports:
+                    phrase = report.get("normalized") or report.get("phrase") or ""
+                    lookup = report.get("lookup") or ""
+                    tfidf_vocab = report.get("tfidf_vocab")
+                    log(f"Phrase: {phrase} (lookup={lookup}) tfidf_vocab={tfidf_vocab}")
+                    rows = report.get("candidates", [])
+                    shown = rows if limit is None else rows[:limit]
+                    for row in shown:
+                          tag = row.get("tag")
+                          alias_token = row.get("alias_token")
+                          score_fasttext = row.get("score_fasttext")
+                          score_context = row.get("score_context")
+                          score_combined = row.get("score_combined")
+                          count = row.get("count")
+                          alias_part = ""
+                          if alias_token and alias_token != tag:
+                              alias_part = f" [alias_token={alias_token}]"
+                          fasttext_str = (
+                              f"{score_fasttext:.3f}" if isinstance(score_fasttext, (int, float)) else score_fasttext
+                          )
+                          if score_context is None:
+                              context_str = "None"
+                          else:
+                              context_str = (
+                                  f"{score_context:.3f}" if isinstance(score_context, (int, float)) else score_context
+                              )
+                          combined_str = (
+                              f"{score_combined:.3f}" if isinstance(score_combined, (int, float)) else score_combined
+                          )
+                          log(
+                              f"  {tag}{alias_part} | fasttext={fasttext_str} context={context_str} "
+                              f"combined={combined_str} count={count}"
+                          )
+                    if limit is not None and len(rows) > limit:
+                        log(f"  ... ({len(rows) - limit} more)")
+        except Exception as e:
+            log(f"Retrieval fallback: {type(e).__name__}: {e}")
+            candidates = []
+        log("Step 3: LLM index selection")
+        # We pass the original 'prompt_in' as the description for the LLM to match against
+        picked_indices = llm_select_indices(
+            query_text=prompt_in,
+            candidates=candidates,
+            max_pick=0,
+            log=log
+        )
+        selected_tags = [candidates[i].tag for i in picked_indices] if picked_indices else []
+        log("Step 4: Compose final prompt")
+        final_prompt = compose_final_prompt(rewritten, selected_tags)
+        log("Done: final prompt ready")
+        return "\n".join(logs), final_prompt
+    except Exception as e:
+        log(f"Error: {type(e).__name__}: {e}")
+        return "\n".join(logs), ""
+# Build the Gradio UI: a prompt box beside the mascot and Run button, a short
+# description, then a read-only console and the final prompt output.
+with gr.Blocks(css=css) as app:
+    with gr.Row():
+        with gr.Column(scale=3, elem_classes=["prompt-col"]):
+            # Single-line input; its value is the only input to rag_pipeline_ui.
+            image_tags = gr.Textbox(
+                label="Enter Prompt",
+                placeholder="e.g. fox, outside, detailed background, .",
+                lines=1
+            )
+        with gr.Column(scale=1):
+            # The mascot is loaded at UI build time, so a missing file fails here.
+            _mascot_pil = Image.open(MASCOT_FILE).convert("RGBA")
+            mascot_img = gr.Image(
+                value=_mascot_pil,
+                show_label=False,
+                interactive=False,
+                height=220,
+                elem_id="mascot"
+            )
+            submit_button = gr.Button("Run", variant="primary")
+    gr.Markdown(
+        """
+### Prompt Squirrel RAG (pipeline version)
+Type a rough prompt. This tool rewrites it and aligns it to an e621-style tag vocabulary using Prompt Squirrel internally,
+then returns a cleaned, model-friendly prompt.
+        """.strip()
+    )
+    # Receives the first element returned by rag_pipeline_ui (the joined log lines).
+    console = gr.Textbox(
+        label="Console",
+        lines=10,
+        interactive=False,
+        placeholder="Progress logs will appear here."
+    )
+    # Receives the second element returned by rag_pipeline_ui (the composed prompt).
+    final_prompt = gr.Textbox(
+        label="Final Prompt",
+        lines=3,
+        interactive=False,
+        placeholder="Your optimized prompt will appear here."
+    )
+    # The button click and the textbox's submit event run the same handler
+    # with the same inputs and outputs.
+    submit_button.click(
+        rag_pipeline_ui,
+        inputs=[image_tags],
+        outputs=[console, final_prompt]
+    )
+    image_tags.submit(
+        rag_pipeline_ui,
+        inputs=[image_tags],
+        outputs=[console, final_prompt]
+    )
+# Launch only when run as a script; the mascot directory is passed as allowed_paths.
+if __name__ == "__main__":
+    app.queue().launch(allowed_paths=[str(MASCOT_DIR)])

data/eval_samples/e621_sfw_sample_1000_seed123_buffer10000.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

docs/retrieval_contract.md ADDED Viewed

	@@ -0,0 +1,197 @@

+# Retrieval Contract -- Stage 2 (Retrieval Grounding / Candidate Generation)
+Stage 2 performs **retrieval grounding** over a **closed vocabulary** of canonical e621-style tags.
+It does not "tag images", and it does not do free-form generation. Its job is to produce a high-recall,
+inspectable candidate pool for downstream **closed-set selection**.
+---
+## Inputs
+- `rewrite_phrases: list[str]`
+  - Output of Stage 1 query rewriting (comma-separated "tag-shaped" phrases).
+  - Not canonical tags. Not underscored. High recall is preferred.
+- `allow_nsfw_tags: bool`
+  - If false, filter out tags in the project's `nsfw_tags` set.
+- `verbose: bool`
+  - If true, return per-phrase debug reports.
+---
+## Normalization and phrase expansion
+1) Normalize rewrite phrases for internal processing:
+- lowercase
+- strip leading/trailing whitespace
+- collapse internal whitespace to a single space
+2) Treat the phrase list as a **set** (dedupe after normalization).
+3) **Head-noun expansion**:
+- For each multi-token phrase, add its head noun (last token) as an additional phrase.
+- Apply the same set semantics so duplicates are processed once.
+Example:
+- Input phrases: `["big shirt", "grey shirt"]`
+- Final phrase set: `{"big shirt", "grey shirt", "shirt"}`
+---
+## Candidate generation per phrase (FastText neighbors + canonicalization)
+For each phrase `p` in the final phrase set:
+1) Convert to lookup form:
+- `lookup = p.replace(" ", "_")`
+2) Retrieve neighbors using FastText:
+- `neighbors = fasttext.most_similar(lookup, topn=per_phrase_k)`
+- Note: FastText neighbors may include alias tokens and other non-canonical strings.
+3) **Project neighbor tokens to canonical tags** (alias -> canonical):
+- If a neighbor token is already a canonical tag (token is in `tag_counts` OR token has a TF-IDF row in `tag_to_row_index`), it maps to itself.
+- Else if it is an alias, map it via `alias2tags[token]` (may map to multiple canonical tags).
+- Else, drop it (not in closed vocabulary).
+4) **Deduplicate by canonical tag** within this phrase:
+- Keep the canonical tag with the highest FastText similarity among all tokens that mapped to it.
+- Record the token that achieved that max similarity as `alias_token` for verbose reporting ("best token wins").
+5) **Exact-match injection**:
+- Project the phrase's own `lookup` through the same projection logic.
+- For each canonical tag produced by that projection, inject it into the candidate set with:
+  - `score_fasttext = 1.0`
+  - `alias_token = lookup`
+- This ensures the phrase canonical appears even though `most_similar()` often does not return the query token itself.
+6) Apply NSFW filtering (if `allow_nsfw_tags=False`):
+- Drop candidate canonical tags that are present in `nsfw_tags`.
+Result: for each phrase, we have a set of canonical candidate tags with:
+- `score_fasttext`
+- `alias_token` (token that produced the best FastText score for that canonical tag)
+---
+## Context similarity (TF-IDF -> SVD cosine)
+Stage 2 computes one **query context vector** for the entire request:
+1) Build a pseudo TF-IDF vector from the **final phrase set** (deduped + head nouns):
+- Convert each phrase to underscore form (same `lookup` rule).
+- Terms that exist in the TF-IDF vocabulary (underscore lookups) contribute `(term_count * idf(term))`.
+- OOV terms contribute nothing (but may be reported in verbose mode).
+2) Project to SVD space and L2-normalize:
+- `query_vec = normalize(svd.transform(tfidf_vec))`
+If the query vector has zero norm (no recognized TF-IDF terms), then `query_has_context = False` and:
+- `score_context = None` for all candidates
+- `score_combined = score_fasttext` (FastText-only)
+If `query_has_context = True`, compute per-candidate cosine similarity when possible:
+- For tags that have a TF-IDF/SVD row: `score_context_by_tag[tag] = dot(query_vec, reduced_matrix_norm[row])`
+- For tags that lack a TF-IDF/SVD row: initial `score_context = None` (may be imputed per-phrase)
+### Missing context policy (per-phrase, q=0.10)
+If `query_has_context = True` and a candidate tag has `score_context = None`:
+- For that phrase, compute `default_context_for_phrase` as the 10th percentile (q=0.10) of the available (non-None) context scores among that phrase's candidates.
+- If there are no available context scores for that phrase, `default_context_for_phrase = 0.0`.
+- Impute missing context scores using `default_context_for_phrase` and mark:
+  - `context_imputed = True`
+Otherwise:
+- `context_imputed = False`
+---
+## Score fusion (FastText + Context)
+Compute a fused score per phrase candidate:
+- If `query_has_context = False`:
+  - `score_combined = score_fasttext`
+- Else:
+  - `score_combined = (1 - context_weight) * score_fasttext + context_weight * score_context`
+  - (`score_context` may be imputed as described above)
+---
+## Per-phrase truncation and must-include rule
+After scoring candidates for a phrase:
+- Sort by `score_combined` descending.
+- Keep top `per_phrase_final_k` (typically 10).
+**Must-include rule (pinned exact phrase tags)**:
+- Let `required_tags` be the canonical tag(s) produced by projecting the phrase's own `lookup` (`projected_lookup`).
+- Each required tag must appear in that phrase's final top `per_phrase_final_k` list, even if its fused score would otherwise place it below the cutoff.
+- If the list is full, evict the lowest-ranked tag that is *not* required.
+- Note: `required_tags` may contain multiple canonicals if `alias2tags` maps a token to multiple tags.
+This rule applies **only to the phrase's own required tags**. It does not inject tags into other phrases' lists.
+---
+## Merge across phrases (global candidate pool)
+A canonical tag may appear in multiple per-phrase top-K lists. Stage 2 deduplicates tags into a single global record.
+- `sources` is the union of phrases whose per-phrase lists contained the tag.
+- `score_fasttext` is the maximum FastText score observed for the tag across those phrases.
+- `score_context` is the maximum context cosine observed for the tag across those phrases (with `None` treated as missing).
+- `score_combined` is the maximum fused score observed for the tag across those phrases.
+Note:
+- These maxima may come from different phrases; the global candidate row does not necessarily correspond to any single phrase's row.
+- For tags with a TF-IDF row, `score_context` is phrase-invariant. Differences across phrases only arise for tags whose context score was imputed.
+Finally:
+- Sort global candidates by `score_combined` descending.
+- Return top `global_k` candidates (and optionally all candidates if the app needs them).
+---
+## Output schema
+### Stage 2 return (non-verbose)
+- `candidates: list[Candidate]` (ordered)
+  - `tag: str` (canonical)
+  - `score_combined: float`
+  - `score_fasttext: float | None`
+  - `score_context: float | None` (None only when `query_has_context=False` or when missing)
+  - `count: int | None`
+  - `sources: list[str]`
+### Optional per-phrase debug report (verbose)
+For each phrase:
+- `phrase: str`
+- `normalized: str`
+- `lookup: str`
+- `tfidf_vocab: bool` (lookup is in TF-IDF vocabulary)
+- `oov_terms: list[str]`
+- `candidates: list[CandidateRow]` (top per-phrase list)
+  - `tag: str`
+  - `alias_token: str`
+  - `score_fasttext: float`
+  - `score_context: float | None`
+  - `score_combined: float`
+  - `context_imputed: bool`
+  - `count: int | None`
+---
+## Determinism and performance constraints
+- Artifact loading is **lazy** (load-on-first-use, cached thereafter).
+- No feature flags for old/new behavior: delete old code paths.
+- Logging must be read-only and must not affect results.
+---
+## NSFW tag source
+- `nsfw_tags` is sourced from `word_rating_probabilities.csv` with `NSFW_THRESHOLD=0.95` as implemented in `psq_rag.retrieval.state`.

docs/rewrite_contract.md ADDED Viewed

	@@ -0,0 +1,141 @@

+# Stage 1 — Query Rewriting Contract
+## Purpose
+Stage 1 (“Query Rewriting”) converts a free-form natural-language prompt into a
+comma-separated list of short, tag-shaped phrases suitable for downstream
+retrieval over a closed image-tag vocabulary.
+This stage is not tagging, not normalization, and not validation.
+Its sole role is to rewrite user intent into a retrieval-friendly surface form
+with high recall.
+---
+## Inputs
+- User prompt: an arbitrary string entered by the user.
+- The input may include:
+  - natural language
+  - comma-separated phrases
+  - Stable-Diffusion-style parentheses and weights
+  - punctuation and spacing artifacts
+No structural guarantees are assumed about the input.
+---
+## Pre-Rewrite Heuristics (Non-LLM)
+Before the LLM rewrite is invoked, the system performs a lightweight heuristic
+extraction:
+- The prompt is split on "." and ","
+- Segments with three or fewer whitespace-separated tokens are retained
+- Case-insensitive deduplication is applied
+This produces a small list of user-provided phrases that may later be appended
+to the rewrite output for retrieval support.
+This heuristic:
+- is lossy
+- is not authoritative
+- exists only to preserve short explicit phrases if the rewrite fails or omits them
+---
+## Rewrite Mechanism
+Stage 1 uses a single deterministic LLM call with:
+- temperature = 0.0
+- no retries
+- no streaming
+- no structured output enforcement
+The system prompt instructs the model to:
+- output a comma-separated list
+- use short, literal, tag-shaped phrases
+- preserve coherent multi-word visual concepts
+- avoid inventing details
+- avoid demographic inference
+- avoid guessing identities
+The LLM output is treated as plain text.
+---
+## Output Format
+On success, Stage 1 returns:
+- a single string
+- containing comma-separated phrases
+- with arbitrary spacing normalized
+- truncated to a maximum of approximately 800 characters
+No further parsing, validation, or canonicalization is applied at this stage.
+The rewrite may:
+- reorder concepts
+- merge or split phrasing
+- introduce additional generic visual concepts (e.g. "white background")
+---
+## Failure and Fallback Behavior
+If the LLM call:
+- errors
+- produces a refusal-like response
+- returns empty output
+then Stage 1 returns an empty string.
+In downstream stages, this empty rewrite may be supplemented by the heuristic
+phrases extracted earlier, but Stage 1 itself does not attempt recovery.
+---
+## Explicit Non-Guarantees
+Stage 1 does not guarantee that:
+- output phrases correspond to known vocabulary tags
+- phrases are unique
+- phrases are canonicalized
+- phrases are mutually exclusive
+- all user concepts are preserved
+- added concepts reflect ground truth
+Stage 2 must not assume any of the above.
+---
+## Contract Boundary with Stage 2
+Stage 1 guarantees only that:
+- output is a comma-separated list of short phrases
+- phrases are intended to be retrieval queries, not canonical tags
+- output is deterministic for a given input
+Stage 2 is responsible for:
+- normalization
+- deduplication
+- head-noun expansion
+- vocabulary grounding
+- alias handling
+- scoring and ranking
+---
+## Summary (Interview-Safe)
+Stage 1 is a deterministic query-rewriting step that reshapes free-form text into
+retrieval-friendly phrase queries. It intentionally favors recall and
+surface-form alignment over correctness or canonicalization, delegating all
+grounding and validation to later stages.

docs/stage3_contract.md ADDED Viewed

	@@ -0,0 +1,170 @@

+STAGE 3 CONTRACT: CLOSED-SET SELECTION
+Purpose
+-------
+Stage 3 performs closed-set selection over the candidate set produced by Stage 2.
+It must output only canonical tags drawn from the provided candidates.
+No hallucinated or novel tags are permitted.
+Stage 3 is not retrieval. Stage 2 already performs candidate generation and
+retrieval grounding. Stage 3 is selection / reranking only.
+Inputs
+------
+1) User prompt
+- original_prompt: str
+  The user's original text prompt. This is the primary semantic signal used by
+  Stage 3.
+2) Candidate set (from Stage 2)
+- candidates: List[Candidate]
+Each Candidate corresponds to one canonical tag.
+Required fields:
+- tag: str
+  Canonical tag name (e621-style snake_case). Unique within this list.
+- count: Optional[int]
+  Frequency/count from the tag corpus. Used only as a hint or ordering signal.
+Optional fields (may be present but must not be required by Stage 3):
+- score_fasttext: Optional[float]
+- score_context: Optional[float]
+- score_combined: Optional[float]
+- alias_token: Optional[str]        (debug / evidence only)
+- sources: Optional[List[str]]      (debug / evidence only)
+Contract note:
+Stage 3 must not rely on optional fields to function correctly.
+3) Selection mode parameters (system-controlled)
+These are not user-facing.
+- mode: "single_shot" | "chunked_map_union"
+If mode == "chunked_map_union":
+- chunk_size: int          (e.g., 50–80)
+- per_chunk_budget: int    (soft cap, e.g., 10–20)
+Optional:
+- debug_rationale: bool    (default false in production)
+LLM-Facing Representation
+-------------------------
+Candidates are presented to the LLM as an indexed list per call.
+For each call:
+- Indices are local to that call: 1..N_local
+- A mapping idx -> canonical tag is maintained by the system
+Each candidate line should include:
+- local index
+- canonical tag
+- optionally count
+Example:
+27. blue_fur (count=12034)
+Indices are not required to be stable across calls and must be mapped back
+immediately after parsing.
+Outputs
+-------
+Primary output:
+- selected_tags: List[str]
+  Canonical tag names. Must be a subset of the provided candidate tags.
+Optional outputs (recommended for development and smoke tests):
+- why_by_tag: Dict[str, str]
+  Compact rationale code per selected tag (only if debug_rationale == true).
+- stage3_diagnostics: Dict[str, Any]
+  Parse and validation statistics (for testing and analysis).
+Per-Call LLM Output Schema
+-------------------------
+Each LLM call must return valid JSON of the following form:
+{
+  "selections": [
+    { "i": 27, "why": "explicit" },
+    { "i": 6,  "why": "strong_implied" }
+  ]
+}
+Fields:
+- i: int
+  Local index within that call.
+- why: str
+  Rationale code (required only if debug_rationale == true).
+Allowed rationale codes:
+- explicit
+- strong_implied
+- weak_implied
+- style_or_meta
+- other
+Validation Rules
+----------------
+Per-call validation:
+- selections is a list
+- i is an integer
+- 1 <= i <= N_local
+- indices are unique within the call
+- if debug_rationale == true, why must be one of the allowed codes
+Global validation (after mapping indices to tags):
+- every selected tag must exist in the Stage 2 candidate set
+- duplicates removed by canonical tag identity
+- final selected_tags is the deterministic result of mapping and union
+Policy note:
+If NSFW tags are disallowed, Stage 2 must remove them. Stage 3 does not require
+policy flags as input. Defense-in-depth checks are allowed but not required.
+Chunking and Aggregation Behavior
+--------------------------------
+Single-shot mode:
+- One LLM call over all candidates
+- Output parsed, validated, and mapped
+Chunked Map + Union mode (no LLM reduce):
+- Split candidate list into chunks of size chunk_size
+- For each chunk:
+  - enumerate locally 1..N_local
+  - run one LLM call
+  - parse and validate
+  - map indices to canonical tags immediately
+- Aggregate across chunks by union on canonical tag:
+  - why_by_tag[tag] chosen by majority vote or first occurrence
+No second LLM consolidation or pruning call is implied or required.
+Ordering of Final Output
+------------------------
+The final selected_tags list must be ordered deterministically using:
+1) descending why score (as defined by the system)
+2) tie-break by descending count
+Smoke Test Requirements
+-----------------------
+Stage 3 smoke tests should report:
+- JSON parse success rate
+- invalid index rate
+- duplicate index rate
+- selection size distribution
+- union size distribution (chunked mode)
+- stability across repeated runs on identical input
+- quality metrics vs ground truth where available:
+  precision / recall / F1 over tag sets
+Smoke test results are used to empirically choose between single_shot and
+chunked_map_union for typical candidate set sizes.

e621naturallanguagedataset.txt ADDED Viewed

	@@ -0,0 +1,140 @@

+Dataset Card for furry-e621-sfw-7m-hq
+Dataset Summary
+This is 6.92 M captions of the images from the safe-for-work (SFW) split of e621 ("e926"). It extends to January 2023, before the widespread advent of machine learning images. It includes captions created by LLMs and a custom multilabel classifier along with CogVLM. There are 8 LLM (mistralai/Mistral-7B-v0.1) and 1 CogVLM (THUDM/CogVLM) captions per image.
+Most captions are substantially larger than 77 tokens and are unsuitable for discrimination using current CLIP-based approaches.
+Languages
+The captions are in English.
+Original Categorized Tags For LLM Captions
+The tags were selected for safe-for-work attributes and were filtered down to approximately 7,000 tags. A multilabel classifier was created using DINOv2 giant (facebook/dinov2-giant) with the pooled output of the visual encoder. The classifier was trained with APL loss (gamma -4, -6, and -8) for 1000 epochs and the best model achieved an AP of 0.342 and F1 of 0.5576.
+These tags were manually categorized into the following labels:
+    animals_and_anthropomorphic_features
+    clothing_and_accessories
+    characters_and_gender
+    hairstyle
+    background_and_setting
+    number_of_characters
+    miscellaneous
+    actions_and_poses
+    colors
+    furniture_and_objects
+    body_and_body_parts
+    emotions_and_expressions
+Data Instances
+An example of a row:
+{
+    "id": 3556547,
+    "md5": "1ae8668745b8fefb83e79d0c77e31a4e",
+    "caption_cogvlm": "The image depicts an anthropomorphic creature, possibly a possum, sitting at a desk in front of a computer. The creature has a somewhat disgruntled expression, with fur that is a mix of black and white. The background is muted, with a grayish tone, and the desk has a yellowish hue. The creature is wearing a black shirt and is seen typing on the keyboard. The overall mood of the image is somber and introspective.",
+    "caption_llm_0": "a solo female marsupial, specifically a possum, sitting on a chair in front of a simple background. She has black hair and is wearing clothing accessories like a laptop and keyboard. The possum has anthropomorphic features such as bipedalism, snout, and whiskers. She is holding an object while painting or typing at the computer desk with her fingers. The background consists of sky and clouds with countershading present.",
+    "caption_llm_1": "A solo female marsupial, specifically a possum, sitting at a desk with her black-furred body and white-furred face. she is wearing black clothing and a shirt, as well as having pink markings on her nose. the possum is holding an object while painting or typing on the computer. her hair is black, and she has grey fur on her body with brown fur accents. she has breasts and tufts of hair on her head. the background consists of furniture such as a chair, table, laptop, monitor, keyboard and container in various shades of white or grey colors.",
+    "caption_llm_2": "a female possum, sitting on a chair and looking at the viewer. She has black hair and is wearing a black shirt over her white body with two-tone fur. The background is simple, with a sky visible through the window. She's holding an object while painting at her desk, which also contains a laptop, monitor, keyboard, and computer mouse. Her nose is pink and she has narrowed eyes as she reacts to something in the scene.",
+    "caption_llm_3": "A solo female marsupial, specifically a possum, sitting on a chair in front of a computer. the possum has white fur on its body and black fur on its face, creating two-tone fur. it is wearing black clothing and has pink markings on its nose. the background shows furniture such as a desk, table, and container with various objects like laptop, monitor, computer mouse and keyboard. the possum's hair is black while it looks at the viewer with narrowed eyes and half-closed eyes. it also holds an object in one hand while painting with the other hand or typing at the computer.",
+    "caption_llm_4": "a solo, female marsupial possum sitting at a desk, typing on a laptop while wearing a black shirt and white fur. The background is detailed and set indoors. The animal has humanoid hands, bipedal stance, chest tufts, and pink nose. It's clothed in black clothing with black ears and topwear. The possum's body has two-tone fur - one color being white and the other being black. A computer or table can be seen in the background as part of the furniture setting.",
+    "caption_llm_5": "A solo, female marsupial, specifically a possum, sitting and looking at an object while typing. the animal has humanoid hands and is clothed in black clothing with black ears and a black shirt. its fur is two-toned with white body and white fur, as well as pink nose. the background is detailed and set indoors.",
+    "caption_llm_6": "a solo, female marsupial possum sitting at a desk, typing on a laptop while wearing a shirt and displaying humanoid hands. The animal has bags under its eyes, eye bags, and tufts of fur on its chest. It also shows teeth when it frowns or displays anger with clenched teeth. The possum's hair is visible in the artwork.",
+    "caption_llm_7": "A solo, female marsupial possum sitting at a desk, typing on a laptop while wearing black clothing and black topwear. the possum has bipedal humanoid features, chest tufts, bags under its eyes, and breasts. its fur is two-toned with black ears and white body/fur. it also has visible teeth and pink nose. the background includes furniture such as a computer and table.",
+    "tags_synthetic_categorized": "{\"animals_and_anthropomorphic_features\":[\"anthro\",\"biped\",\"feral\",\"snout\",\"whiskers\"],\"number_of_characters\":[\"solo\"],\"clothing_and_accessories\":[\"clothing\",\"fur\",\"topwear\",\"clothed\",\"shirt\"],\"characters_and_gender\":[\"female\"],\"furniture_and_objects\":[\"computer\",\"furniture\",\"table\",\"laptop\",\"chair\",\"desk\",\"container\",\"computer_mouse\",\"keyboard\",\"monitor\"],\"colors\":[\"white_body\",\"white_fur\",\"two_tone_fur\",\"black_body\",\"black_fur\",\"two_tone_body\",\"black_clothing\",\"black_topwear\",\"black_shirt\",\"pink_nose\",\"grey_body\",\"grey_fur\",\"brown_body\",\"brown_fur\",\"black_nose\"],\"actions_and_poses\":[\"sitting\",\"looking_at_viewer\",\"holding_object\",\"painting\",\"typing\",\"standing\",\"on_chair\",\"looking_at_object\"],\"hairstyle\":[\"hair\",\"black_hair\"],\"background_and_setting\":[\"inside\",\"simple_background\",\"outside\",\"detailed_background\",\"sky\",\"cloud\",\"countershading\"],\"body_and_body_parts\":[\"breasts\",\"tuft\",\"teeth\",\"fingers\",\"markings\",\"5_fingers\",\"eyebrows\",\"arm_support\",\"eye_bags\"],\"miscellaneous\":[\"text\"],\"emotions_and_expressions\":[\"open_mouth\",\"smile\",\"narrowed_eyes\",\"reaction_image\",\"half-closed_eyes\"],\"species_or_animal_type\":[\"didelphid\",\"mammal\",\"virginia_opossum\",\"marsupial\"]}\r\n",
+    "tags_ground_truth_categorized": "{\"emotions_and_expressions\":[\"angry\",\"clenched_teeth\",\"frown\",\"reaction_image\",\"teeth_showing\"],\"animals_and_anthropomorphic_features\":[\"anthro\",\"biped\",\"chest_tuft\",\"humanoid_hands\"],\"body_and_body_parts\":[\"bags_under_eyes\",\"breasts\",\"eye_bags\",\"teeth\",\"teeth_visible\",\"tuft\"],\"colors\":[\"black_clothing\",\"black_ears\",\"black_shirt\",\"black_topwear\",\"pink_nose\",\"two_tone_body\",\"two_tone_fur\",\"white_body\",\"white_fur\"],\"clothing_and_accessories\":[\"clothed\",\"clothing\",\"fur\",\"shirt\",\"t-shirt\",\"topwear\"],\"furniture_and_objects\":[\"computer\",\"desk\",\"furniture\",\"laptop\",\"table\"],\"background_and_setting\":[\"detailed_background\",\"inside\"],\"characters_and_gender\":[\"female\"],\"hairstyle\":[\"hair\"],\"actions_and_poses\":[\"looking_at_object\",\"sitting\",\"typing\"],\"number_of_characters\":[\"solo\"],\"species_or_animal_type\":[\"mammal\",\"marsupial\",\"possum\"]}\r\n",
+}
+LLM-derived Captions
+The caption_llm_x field was produced with the following prompt using the mistralai/Mistral-7B-v0.1 weights:
+Please make a detailed description, one paragraph long, of the image using this JSON of categorized tags:
+{{ tags }}
+For every nth image where n was odd, the text "artwork of {{ characters }}." was appended for all characters with >= 10 images.
+For the first four captions (caption_llm_0 to caption_llm_3), synthetic tags were used. For the last four captions (caption_llm_4 to caption_llm_7), ground truth tags were used.
+For every caption, two categories were dropped out of the categorized tags each time (excluding species) to force the LLM to focus on different aspects of the image.
+For a small number of images, LLM captions were not computed. These are left as empty strings for these images.
+CogVLM-derived Captions
+The caption_cogvlm field was produced with the following prompt using the THUDM/CogVLM weights:
+Please make a detailed description, one paragraph long, of the image using this JSON of categorized tags:
+{{ tags }}
+The tags provided were the ground truth, categorized tags.
+CogVLM captions often display repetitive prefixes. You can remove them with:
+REPEATED_OPENINGS = [
+  ('The image showcases ', ''),
+  ('The image portrays ', ''),
+  ('The image appears to be ', ''),
+  ('The image is ', ''),
+  ('The image depicts ', ''),
+  ('The image features ', ''),
+  ('This image showcases ', ''),
+  ('This image portrays ', ''),
+  ('This image appears to be ', ''),
+  ('This image is ', ''),
+  ('This image depicts ', ''),
+  ('This image features ', ''),
+  ('In this picture, ', ''),
+  ('In this artwork, ', 'Artwork of '),
+  ('In this illustration, ', 'Illustration of '),
+  ('In this depiction, ', ''),
+  ('In this piece, ', ''),
+  ('In this image, ', ''),
+  ('In this art piece, ', 'Art of '),
+  ('In this scene, ', ''),
+]
+def postprocess_caption(caption: str):
+  for often_repeated, replacer in REPEATED_OPENINGS:
+    if often_repeated in caption:
+      caption = caption.replace(often_repeated, replacer, 1).capitalize()
+  return caption
+Data Splits
+	train
+furry-e621-sfw-7m-hq 	768859
+Dataset Creation
+Source Data
+Collected from e621 according to their rate-limiting instructions on archiving content.
+Discussion of Biases
+The captions are biased to the results of the multilabel classifier and the CogVLM model.
+Known Limitations
+The LLM-derived captions commonly hallucinate text, and a small number of them are corrupted by repeating tokens or tag lists. The CogVLM-derived captions have more accurate OCR but may also occasionally hallucinate text or small details.
+For a small number of images, LLM captions were not computed. These are left as empty strings for these images.
+While the images are labeled as "safe", they were not inspected for safety and may contain inappropriate subject matter.
+Additional Information
+Dataset Curators
+Caption Emporium
+Downloading the Images
+Please refer to this issue.
+Licensing Information
+The dataset is available under the Creative Commons Attribution-ShareAlike 4.0 International license (CC BY-SA 4.0).
+Citation Information
+@misc{furry-e621-sfw-7m-hq,
+  author = { Caption Emporium },
+  title = {furry-e621-sfw-7m-hq},
+  year = {2024},
+  publisher = {Huggingface},
+  journal = {Huggingface repository},
+  howpublished = {\url{https://huggingface.co/datasets/CaptionEmporium/furry-e621-sfw-7m-hq}},
+}

fluffyrock_3m.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

mascotimages/transparentsquirrel.png ADDED Viewed

Git LFS Details

SHA256: 8e18321c9051b82ab18932ef9ed4052915659b83ef2065050600d0c06bddb9e7
Pointer size: 131 Bytes
Size of remote file: 257 kB

predict_all_tags_from_dump.ipynb ADDED Viewed

	@@ -0,0 +1,721 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "55c95870",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import csv\n",
+    "import gzip\n",
+    "from math import log\n",
+    "from collections import Counter\n",
+    "from sys import maxsize\n",
+    "import numpy as np\n",
+    "import joblib\n",
+    "from collections import OrderedDict\n",
+    "from sklearn.metrics.pairwise import cosine_similarity\n",
+    "from collections import defaultdict\n",
+    "import sys\n",
+    "from scipy.sparse import dok_matrix\n",
+    "from sklearn.preprocessing import normalize\n",
+    "from sklearn.decomposition import TruncatedSVD\n",
+    "\n",
+    "\n",
+    "\n",
+    "posts_file = 'posts-2024-04-14.csv.gz'\n",
+    "fluffyrock_tags_list_file = 'fluffyrock_3m.csv'\n",
+    "\n",
+    "\n",
+    "def extract_artist_names(file_path):\n",
+    "    \"\"\"\n",
+    "    Extract artist names from a CSV file where each row contains tag information,\n",
+    "    and the first column contains the tag's name. Artist tags start with 'by_'.\n",
+    "\n",
+    "    :param file_path: Path to the CSV file\n",
+    "    :return: A set containing artist names without the 'by_' prefix\n",
+    "    \"\"\"\n",
+    "    artists = set()\n",
+    "\n",
+    "    # Open the CSV file and read it\n",
+    "    with open(file_path, newline='', encoding='utf-8') as csvfile:\n",
+    "        reader = csv.reader(csvfile)\n",
+    "        \n",
+    "        # Iterate over each row in the CSV file\n",
+    "        for row in reader:\n",
+    "            tag_name = row[0]  # Assuming the first column contains the tag names\n",
+    "            if tag_name.startswith('by_'):\n",
+    "                # Strip 'by_' from the start of the tag name and add it to the set\n",
+    "                artist_name = tag_name[3:]  # Remove the first three characters 'by_'\n",
+    "                artists.add(artist_name)\n",
+    "\n",
+    "    return artists\n",
+    "\n",
+    "\n",
+    "def build_tag_list(tags, e621_rating_character, fav_count, artist_names):\n",
+    "    results = []\n",
+    "    \n",
+    "    #score\n",
+    "    score_value = min(1.0, (log(int(fav_count)+1) / 10))\n",
+    "    rounded_score_value = round(score_value * 10)\n",
+    "    results.append(f\"score: {rounded_score_value}\")\n",
+    "        \n",
+    "    #rating\n",
+    "    results.append(\"rating:\" + e621_rating_character)\n",
+    "    \n",
+    "    #regular tags and artists\n",
+    "    for tag in tags:\n",
+    "        if tag in artist_names:\n",
+    "            results.append(\"by_\" + tag)\n",
+    "        else:\n",
+    "            results.append(tag)\n",
+    "    return results\n",
+    "\n",
+    "\n",
+    "def read_csv_as_dict(file_path):\n",
+    "    \"\"\"\n",
+    "    Generator function to read a gzipped CSV file and yield each row as a dictionary\n",
+    "    where keys are the column names and values are the data in each column.\n",
+    "\n",
+    "    :param file_path: Path to the .csv.gz file\n",
+    "    \"\"\"\n",
+    "    \n",
+    "    #counter=0\n",
+    "    with gzip.open(file_path, 'rt', newline='', encoding='utf-8') as gz_file:\n",
+    "        csv.field_size_limit(1000000)\n",
+    "        reader = csv.DictReader(gz_file)\n",
+    "        for row in reader:\n",
+    "            #counter += 1\n",
+    "            #if counter % 100 == 0:\n",
+    "            yield row\n",
+    "            \n",
+    "            \n",
+    "def process_tags_from_csv(file_path, artist_names):\n",
+    "    \"\"\"\n",
+    "    Generator function that reads rows from a CSV file, processes each row to extract and\n",
+    "    build tag lists, and yields these lists one at a time.\n",
+    "\n",
+    "    :param file_path: The path to the gzipped CSV file.\n",
+    "    :param artist_names: A set containing all artist names for tag processing.\n",
+    "    :return: Yields lists of tags for each row.\n",
+    "    \"\"\"\n",
+    "    for row in read_csv_as_dict(file_path):\n",
+    "        base_tags = row['tag_string'].split(' ')\n",
+    "        rating_character = row['rating']\n",
+    "        fav_count = row['fav_count']\n",
+    "        all_tags = build_tag_list(base_tags, rating_character, fav_count, artist_names)\n",
+    "        yield all_tags\n",
+    "        \n",
+    "        \n",
+    "def construct_pseudo_vector(pseudo_doc_terms, idf_loaded, tag_to_column_loaded):\n",
+    "    # Initialize a vector of zeros with the length of the term_to_index mapping\n",
+    "    pseudo_vector = np.zeros(len(tag_to_column_loaded))\n",
+    "    \n",
+    "    # Fill in the vector for terms in the pseudo document\n",
+    "    for term in pseudo_doc_terms:\n",
+    "        if term in tag_to_column_loaded:\n",
+    "            index = tag_to_column_loaded[term]\n",
+    "            pseudo_vector[index] = idf_loaded.get(term, 0)\n",
+    "    \n",
+    "    # Return the vector as a 2D array for compatibility with SVD transform\n",
+    "    return pseudo_vector.reshape(1, -1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0a9becfd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_artist_names = extract_artist_names(fluffyrock_tags_list_file)\n",
+    "\n",
+    "tag_count = Counter()\n",
+    "min_occurrences = 200\n",
+    "    \n",
+    "for all_tags in process_tags_from_csv(posts_file, all_artist_names):\n",
+    "    tag_count.update(all_tags)\n",
+    "    \n",
+    "\n",
+    "# Apply the counting logic from the first code snippet\n",
+    "sorted_tags = tag_count.most_common()\n",
+    "filtered_tags = [tag for tag, count in sorted_tags if count >= min_occurrences]\n",
+    "\n",
+    "# Print tag counts before and after filtering\n",
+    "print(\"Tag count before filtering: \", len(tag_count))\n",
+    "print(\"Tag count after filtering: \", len(filtered_tags))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "56f8d7cd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Initialize a dictionary to hold the co-occurrences for each tag in filtered_tags\n",
+    "# Using a nested defaultdict for automatic handling of missing keys\n",
+    "pseudo_docs = defaultdict(lambda: defaultdict(int))\n",
+    "\n",
+    "# Number of rows (posts) processed\n",
+    "total_rows_processed = 0\n",
+    "\n",
+    "# Read each row and process the tags\n",
+    "for all_tags in process_tags_from_csv(posts_file, all_artist_names):\n",
+    "    # Filter the tags in the current list to include only those in filtered_tags\n",
+    "    filtered_tag_list = [tag for tag in all_tags if tag in filtered_tags]\n",
+    "    \n",
+    "    # For each tag in the filtered list\n",
+    "    for tag in filtered_tag_list:\n",
+    "        # For each co-occurring tag in the same list\n",
+    "        for co_occur_tag in filtered_tag_list:\n",
+    "            if co_occur_tag != tag:\n",
+    "                pseudo_docs[tag][co_occur_tag] += 1\n",
+    "\n",
+    "    # Count rows processed so far, for progress monitoring\n",
+    "    total_rows_processed += 1\n",
+    "    if total_rows_processed % 10000 == 0:\n",
+    "        print(f\"Processed {total_rows_processed} rows\", file=sys.stderr)\n",
+    "\n",
+    "print(\"Processing complete.\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b1d011a5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Number of pseudo-documents\n",
+    "N = len(pseudo_docs)\n",
+    "\n",
+    "# Calculate TF and DF\n",
+    "tf = {}\n",
+    "df = {}\n",
+    "for doc, terms in pseudo_docs.items():\n",
+    "    tf[doc] = {}\n",
+    "    total_terms = sum(terms.values())\n",
+    "    for term, count in terms.items():\n",
+    "        tf[doc][term] = count / total_terms  # Term Frequency\n",
+    "        df[term] = df.get(term, 0) + 1  # Document Frequency\n",
+    "        \n",
+    "# Ensure all terms are indexed\n",
+    "all_terms = set(df.keys())\n",
+    "term_to_column_index = {term: idx for idx, term in enumerate(all_terms)}\n",
+    "\n",
+    "# Calculate IDF\n",
+    "idf = {term: log((N + 1) / (df_val + 1)) for term, df_val in df.items()}  # Adding 1 to prevent division by zero\n",
+    "\n",
+    "# Initialize the TF-IDF matrix\n",
+    "tfidf_matrix = dok_matrix((N, len(df)), dtype=float)\n",
+    "\n",
+    "# Mapping of tags to matrix rows\n",
+    "tag_to_row = {tag: idx for idx, tag in enumerate(pseudo_docs)}\n",
+    "\n",
+    "# Compute TF-IDF and fill the matrix\n",
+    "for doc, terms in tf.items():\n",
+    "    row_idx = tag_to_row[doc]\n",
+    "    for term, tf_val in terms.items():\n",
+    "        col_idx = term_to_column_index[term]  # Use term_to_column_index for column indexing\n",
+    "        tfidf_matrix[row_idx, col_idx] = tf_val * idf[term]\n",
+    "\n",
+    "# Convert to CSR format for efficient row slicing\n",
+    "tfidf_matrix = tfidf_matrix.tocsr()\n",
+    "\n",
+    "print(\"TF-IDF matrix shape:\", tfidf_matrix.shape)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b098a5fb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Choose the number of components for the reduced dimensionality\n",
+    "n_components = 300  # For example, reducing to 300 dimensions\n",
+    "\n",
+    "# Initialize the TruncatedSVD object\n",
+    "svd = TruncatedSVD(n_components=n_components, random_state=42)\n",
+    "\n",
+    "# Fit and transform the TF-IDF matrix\n",
+    "reduced_matrix = svd.fit_transform(tfidf_matrix)\n",
+    "\n",
+    "# 'reduced_matrix' now has shape (N, n_components): one n_components-dimensional row per pseudo-document"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "023ae26f",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "06ec21c4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 1: Construct TF vector for the pseudo-document\n",
+    "pseudo_doc_terms = [\"female\"]\n",
+    "pseudo_tfidf_vector = construct_pseudo_vector(pseudo_doc_terms, idf, term_to_column_index)\n",
+    "\n",
+    "# Assuming 'tfidf_matrix' is your original TF-IDF matrix and 'reduced_matrix' is obtained from Truncated SVD\n",
+    "# 'pseudo_tfidf_vector' is the TF-IDF vector for your pseudo-document, constructed as previously discussed\n",
+    "\n",
+    "# For the original TF-IDF matrix\n",
+    "# Compute cosine similarities\n",
+    "cosine_similarities_full = cosine_similarity(pseudo_tfidf_vector, tfidf_matrix).flatten()\n",
+    "print(\"Cosine similarities (full matrix):\", cosine_similarities_full)\n",
+    "# Identify the indices of the top 10 most similar tags\n",
+    "top_indices_full = np.argsort(cosine_similarities_full)[-10:][::-1]\n",
+    "\n",
+    "# For the reduced matrix\n",
+    "# Reduce the dimensionality of the pseudo-document vector\n",
+    "# Before calculating similarities, print the TF-IDF vectors\n",
+    "print(\"Pseudo TF-IDF vector:\", pseudo_tfidf_vector)\n",
+    "reduced_pseudo_vector = svd.transform(pseudo_tfidf_vector)\n",
+    "print(\"Reduced pseudo-document vector:\", reduced_pseudo_vector)\n",
+    "\n",
+    "# Compute cosine similarities in the reduced space\n",
+    "cosine_similarities_reduced = cosine_similarity(reduced_pseudo_vector, reduced_matrix).flatten()\n",
+    "print(\"Cosine similarities (reduced matrix):\", cosine_similarities_reduced)\n",
+    "\n",
+    "\n",
+    "# Identify the indices of the top 10 most similar tags in the reduced space, sorted from most to least similar\n",
+    "top_indices_reduced = np.argsort(cosine_similarities_reduced)[-10:][::-1]\n",
+    "\n",
+    "\n",
+    "# Convert indices to tag names using the inverse of your 'tag_to_row' mapping\n",
+    "# Printing the tag to index and index to tag mappings\n",
+    "print(\"tag_to_row mapping (partial):\", dict(list(tag_to_row.items())[:12]))  # Print only the first 12 entries for brevity\n",
+    "row_to_tag = {idx: tag for tag, idx in tag_to_row.items()}\n",
+    "print(\"row_to_tag mapping (partial):\", dict(list(row_to_tag.items())[:12]))\n",
+    "\n",
+    "# Generate lists of tags with their corresponding similarity scores\n",
+    "top_tags_full = [(row_to_tag[idx], cosine_similarities_full[idx]) for idx in top_indices_full]\n",
+    "top_tags_reduced = [(row_to_tag[idx], cosine_similarities_reduced[idx]) for idx in top_indices_reduced]\n",
+    "\n",
+    "# Output the results with scores\n",
+    "print(\"Most similar tags (Full Matrix):\")\n",
+    "for tag, score in top_tags_full:\n",
+    "    print(f\"{tag}: {score:.4f}\")\n",
+    "\n",
+    "print(\"Most similar tags (Reduced Matrix):\")\n",
+    "for tag, score in top_tags_reduced:\n",
+    "    print(f\"{tag}: {score:.4f}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "91753fa3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Save the model to a file\n",
+    "\n",
+    "# Package necessary components\n",
+    "components_to_save = {\n",
+    "    'idf': idf,\n",
+    "    'tag_to_column_index': term_to_column_index,\n",
+    "    'row_to_tag': row_to_tag, \n",
+    "    'reduced_matrix': reduced_matrix,\n",
+    "    'svd_model': svd\n",
+    "}\n",
+    "\n",
+    "# Save the components into a file\n",
+    "joblib.dump(components_to_save, 'components_file418.joblib')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2e08dc1a",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "d066db2f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Most similar tags (Reduced Matrix):\n",
+      "nameless_(arbuzbudesh): 0.0000\n",
+      "knotted_dildo: 0.0000\n",
+      "black_legs: 0.0000\n",
+      "disguise: 0.0000\n",
+      "lineup: 0.0000\n",
+      "olympics: 0.0000\n",
+      "burping: 0.0000\n",
+      "pink_collar: 0.0000\n",
+      "team_rocket: 0.0000\n",
+      "studded_bracelet: 0.0000\n"
+     ]
+    }
+   ],
+   "source": [
+    "#Reload and test file\n",
+    "\n",
+    "# Load the saved components from the joblib file\n",
+    "components = joblib.load('tf_idf_files_418_updated.joblib')\n",
+    "\n",
+    "# Extract necessary components\n",
+    "idf = components['idf']\n",
+    "term_to_column_index = components['tag_to_column_index']\n",
+    "row_to_tag = components['row_to_tag']\n",
+    "reduced_matrix = components['reduced_matrix']\n",
+    "svd = components['svd_model']\n",
+    "\n",
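+    "# Construct the TF-IDF vector for \"blue_(jurassic_world)\".\n",
+    "# The terms must be passed as a list: a bare string is iterated character by character,\n",
+    "# so no term matches and every similarity comes out as 0.\n",
+    "pseudo_tfidf_vector = construct_pseudo_vector([\"blue_(jurassic_world)\"], idf, term_to_column_index)\n",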
+    "\n",
+    "# Reduce the dimensionality of the pseudo-document vector for the reduced matrix\n",
+    "reduced_pseudo_vector = svd.transform(pseudo_tfidf_vector)\n",
+    "\n",
+    "# Compute cosine similarities in the reduced space\n",
+    "cosine_similarities_reduced = cosine_similarity(reduced_pseudo_vector, reduced_matrix).flatten()\n",
+    "\n",
+    "# Sort the indices by descending cosine similarity\n",
+    "top_indices_reduced = np.argsort(cosine_similarities_reduced)[::-1][:10]\n",
+    "\n",
+    "# Display the most similar tags in the reduced matrix with their scores\n",
+    "print(\"Most similar tags (Reduced Matrix):\")\n",
+    "for idx in top_indices_reduced:\n",
+    "    tag = row_to_tag[idx]\n",
+    "    score = cosine_similarities_reduced[idx]\n",
+    "    print(f\"{tag}: {score:.4f}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ddea5f32",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "74897a5c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c0c5b32d",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9ff9a331",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "91c66b57",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a830c6cf",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4cdc98f0",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "150d66f3",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "337b1f65",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "34d2fde1",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9fc197d8",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bfa9c299",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "551a8453",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0dcdeb9e",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "537c9e26",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aa873abf",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "41aca76f",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "36a3ae96",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fb59bac3",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "39c87db9",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1646e731",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "99f95d09",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9d6a67c2",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "32acbfd7",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3c17cd42",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d333776c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1e8c7511",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "acf35591",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "101fb083",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f8bd8551",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "271b9c12",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a232e088",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "43df0240",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8dbb05e8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9730cb16",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d38f92b2",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "879f5463",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

psq_rag/__init__.py ADDED Viewed

File without changes

psq_rag/llm/__init__.py ADDED Viewed

File without changes

psq_rag/llm/openrouter_client.py ADDED Viewed

	@@ -0,0 +1,121 @@

+import os, json
+from typing import Any, Dict, List, Optional, Tuple
+import httpx
+# OpenRouter settings, read from the environment once at import time.
+# An unset OPENROUTER_API_KEY becomes "", which openrouter_chat reports as missing.
+OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
+# Model id sent with every request; the second argument is the default when the env var is unset.
+OPENROUTER_MODEL = os.environ.get("OPENROUTER_MODEL", "meta-llama/llama-3.1-8b-instruct")
+def _extract_json_object(text: str) -> Optional[dict]:
+    """
+    Best-effort: find the first top-level JSON object in a response.
+    Works even if the model wraps JSON with prose or code fences.
+    """
+    if not text:
+        return None
+    # Strip common fences
+    t = text.strip()
+    t = t.removeprefix("```json").removeprefix("```").removesuffix("```").strip()
+    # Find first {...} span
+    start = t.find("{")
+    if start == -1:
+        return None
+    depth = 0
+    for i in range(start, len(t)):
+        if t[i] == "{":
+            depth += 1
+        elif t[i] == "}":
+            depth -= 1
+            if depth == 0:
+                chunk = t[start:i+1]
+                try:
+                    return json.loads(chunk)
+                except Exception:
+                    return None
+    return None
+def openrouter_chat(
+    messages: List[Dict[str, str]],
+    response_format: Optional[Dict[str, Any]] = None,
+    temperature: float = 0.2,
+    max_tokens: int = 512,
+    timeout_s: float = 30.0,
+) -> Tuple[Optional[str], Optional[dict], Optional[str]]:
+    """
+    Returns (raw_text, parsed_json, error_str).
+    Never raises.
+    Instrumented to help diagnose moderation/routing variance:
+    - includes HTTP status
+    - includes OpenRouter error message/code if provided
+    """
+    if not OPENROUTER_API_KEY:
+        return None, None, "OPENROUTER_API_KEY missing"
+    payload: Dict[str, Any] = {
+        "model": OPENROUTER_MODEL,
+        "messages": messages,
+        "temperature": temperature,
+        "max_tokens": max_tokens,
+    }
+    if response_format is not None:
+        payload["response_format"] = response_format
+    headers = {
+        "Authorization": f"Bearer {OPENROUTER_API_KEY}",
+        "Content-Type": "application/json",
+        "HTTP-Referer": "https://huggingface.co/spaces",
+        "X-Title": "Prompt_Squirrel_RAG",
+    }
+    try:
+        with httpx.Client(timeout=timeout_s) as client:
+            r = client.post(
+                "https://openrouter.ai/api/v1/chat/completions",
+                headers=headers,
+                json=payload,
+            )
+            data = r.json()
+            choice0 = data["choices"][0]
+            content = (choice0["message"].get("content", "") or "").strip()
+            finish_reason = choice0.get("finish_reason")
+            native_finish_reason = choice0.get("native_finish_reason")
+            # (optional) expose these as part of error_str for logging
+            meta = []
+            if data.get("model"):
+                meta.append(f"model={data['model']}")
+            if finish_reason:
+                meta.append(f"finish={finish_reason}")
+            if native_finish_reason:
+                meta.append(f"native_finish={native_finish_reason}")
+            if isinstance(data.get("usage"), dict):
+                u = data["usage"]
+                if "prompt_tokens" in u and "completion_tokens" in u:
+                    meta.append(f"tokens={u['prompt_tokens']}+{u['completion_tokens']}")
+            parsed = _extract_json_object(content)
+            # If it looks filtered, flag it
+            if finish_reason == "content_filter":
+                return content, parsed, f"Filtered (content_filter; {'; '.join(meta)})"
+            # If it looks refusal-like but not content_filter, still flag it
+            if content.lower().startswith(("i can't", "i can’t", "i cannot", "can't", "cannot")):
+                return content, parsed, f"Refusal-like ({'; '.join(meta)})"
+            return content, parsed, None
+    except Exception as e:
+        return None, None, f"{type(e).__name__}: {e}"
+# Import smoke check: running this file directly only confirms that it imports.
+if __name__ == "__main__":
+    print("openrouter_client.py imports ok")

psq_rag/llm/rewrite.py ADDED Viewed

	@@ -0,0 +1,67 @@

+from .openrouter_client import openrouter_chat
+# System prompt for the rewrite step: asks for a comma-separated list of short,
+# tag-shaped phrases and nothing else. Sent verbatim by llm_rewrite_prompt.
+REWRITE_SYSTEM = """Rewrite the input into a concise, comma-separated list of short phrases
+that resemble image tags.
+Use short, literal phrases that reflect how visual concepts are commonly
+written in image tag vocabularies.
+Multi-word phrases are appropriate when they represent one coherent
+visual idea.
+Examples of tag-shaped phrases:
+- wolf, angry
+- blue jacket, striped tail
+- long hair, raised ears
+- holding object, hand on shoulder
+- looking at viewer, looking down
+- simple background, outdoor scene
+- wooden table, plant
+- running, sleeping
+- smiling, angry expression
+- bedroom, forest
+- sonic the hedgehog, princess peach
+Do not invent details or guess identities.
+Do not infer demographic attributes (e.g., gender/age) unless explicitly stated.
+Output ONLY the rewritten list.
+"""
+def llm_rewrite_prompt(prompt_in: str, log) -> str:
+    messages = [
+        {"role": "system", "content": REWRITE_SYSTEM},
+        {"role": "user", "content": prompt_in},
+    ]
+    raw, _parsed_unused, err = openrouter_chat(
+        messages,
+        response_format=None,
+        temperature=0.0,
+        max_tokens=256,
+    )
+    if err:
+        log(f"LLM rewrite: fallback (error: {err})")
+        # NEW: if we got a refusal-like completion, log the refusal text for debugging
+        if raw and err.lower().startswith("refusal-like"):
+            log(f"LLM rewrite refusal text: {raw.strip()[:300]}")
+        return ""
+    out = (raw or "").strip()
+    if not out:
+        log("LLM rewrite: fallback (empty response)")
+        return ""
+    out = " ".join(out.split())
+    if len(out) > 800:
+        out = out[:800].rstrip()
+    log("LLM rewrite: ok")
+    return out
+# Import smoke check: running this file directly only confirms that it imports.
+if __name__ == "__main__":
+    print("rewrite.py imports ok")

psq_rag/llm/select.py ADDED Viewed

	@@ -0,0 +1,711 @@

+# psq_rag/llm/select.py
+# Stage 3: Closed-Set Selection (LangChain-only implementation)
+#
+# This module intentionally uses LangChain for:
+# - prompt templating (including {N})
+# - LLM call orchestration
+# - JSON parsing
+#
+# There is NO fallback path. If LangChain dependencies are missing, this module
+# should fail loudly so you install them.
+import os
+import re
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union, cast, Literal
+from langchain_openai import ChatOpenAI
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import PydanticOutputParser
+from pydantic import BaseModel, Field, SecretStr
+from rapidfuzz import fuzz
+from psq_rag.retrieval.psq_retrieval import Candidate  # Candidate(tag, score_*, count, sources)
+from psq_rag.retrieval.state import get_tag_type_name, get_tag2aliases
+# Allowed rationale codes for a selection; used both in the JSON schema enum
+# and when validating parsed model output.
+WHY_ENUM = ["explicit", "strong_implied", "weak_implied", "style_or_meta", "other"]
+# Deterministic mapping: ordinal "why" -> numeric score for ordering/debug.
+# Keys must stay in sync with WHY_ENUM (scores are looked up by validated code).
+WHY_TO_SCORE: Dict[str, float] = {
+    "explicit": 0.90,
+    "strong_implied": 0.70,
+    "weak_implied": 0.45,
+    "style_or_meta": 0.35,
+    "other": 0.25,
+}
+# IMPORTANT ABOUT TEMPLATING:
+# - This string is rendered by LangChain's f-string template engine.
+# - Literal JSON braces must be escaped as {{ and }}.
+# - {N} is a real template variable and MUST be provided.
+# System prompt for selecting among general (non-character) candidates.
+SELECT_SYSTEM_TEMPLATE = """You are given a description of an image and a list of imageboard tags.
+Select the tags that correspond to content that would be visible or depicted in the described image.
+The list contains only valid tags; many of them are irrelevant to the image.
+Return JSON ONLY matching this schema:
+{{
+  \"selections\": [
+    {{\"i\": <int>, \"why\": \"<one of: explicit|strong_implied|weak_implied|style_or_meta|other>\"}},
+    ...
+  ]
+}}
+Rules:
+- Choose ONLY from indices 1..{N}.
+- Do NOT output tag text.
+- Do NOT output any keys other than \"selections\", and inside each item only the item index \"i\" and \"why\".
+- Do select both a general tag and a more specific tag when both apply (for example, \"shirt\" and \"grey shirt\").
+Define \"why\" as:
+- explicit: directly stated in the image description
+- strong_implied: very likely given the description, even if not literally stated
+- weak_implied: plausible but not strongly supported by the description
+- style_or_meta: stylistic or presentation-related tags only if clearly indicated
+- other: fallback category; use sparingly
+"""
+# System prompt for confirming CHARACTER tag candidates. Uses the same
+# f-string templating rules as SELECT_SYSTEM_TEMPLATE: {{ }} are literal braces
+# and {N} must be supplied. Every selection is requested with why="explicit".
+# NOTE(review): presumably paired with the alias pre-filtered entity list — confirm at the call site.
+ENTITY_SYSTEM_TEMPLATE = """You are given a description of an image and a list of CHARACTER tags.
+These character tags have already been pre-filtered to only include characters whose names
+(or known aliases) appear in the image description. Your job is to confirm which of these
+pre-filtered candidates are the correct match for the character mentioned by the user.
+Return JSON ONLY matching this schema:
+{{
+  \"selections\": [
+    {{\"i\": <int>, \"why\": \"explicit\"}},
+    ...
+  ]
+}}
+Rules for character selection:
+- Choose ONLY from indices 1..{N}.
+- Do NOT output tag text.
+- Always use \"why\": \"explicit\" for all selections.
+- Select the tag that best represents the character as described.
+- If the user described a specific variant (e.g. \"pikachu libre\", \"detective pikachu\"),
+  select that specific variant tag.
+- If the user described only the base character (e.g. just \"pikachu\"), select only the
+  base/default tag, NOT costume or variant tags.
+- When uncertain between variants, prefer the simplest/most general tag.
+"""
+# Human-message template shared by every Stage 3 call. Template variables:
+# {image_description}, {candidate_lines} (numbered "i. tag" lines) and {per_call_budget}.
+USER_TEMPLATE = """IMAGE DESCRIPTION:
+{image_description}
+CANDIDATES (choose by index only):
+{candidate_lines}
+Select up to {per_call_budget} indices. Output fewer if uncertain.
+"""
+# One validated selection produced by _parse_validate_map (immutable).
+@dataclass(frozen=True)
+class Selected:
+    i: int  # 1-based index into the candidate list of the call that produced it
+    tag: str  # canonical tag (underscore form)
+    why: str  # rationale code, one of WHY_ENUM
+    score: float  # numeric score derived from `why` via WHY_TO_SCORE
+# Static-typing mirror of WHY_ENUM; the two lists must be kept in sync by hand.
+WhyLiteral = Literal["explicit", "strong_implied", "weak_implied", "style_or_meta", "other"]
+# Pydantic model for a single entry of the model's "selections" array.
+class Stage3SelectionItem(BaseModel):
+    i: int = Field(..., description="1-based index into the candidate list.")
+    why: WhyLiteral = Field(..., description="Rationale code from the allowed set.")
+# Pydantic model for the whole Stage 3 reply; a missing "selections" key yields an empty list.
+class Stage3SelectionResponse(BaseModel):
+    selections: List[Stage3SelectionItem] = Field(default_factory=list)
+def _build_response_format() -> Dict[str, Any]:
+    # Strict JSON Schema structured output.
+    schema = {
+        "type": "object",
+        "properties": {
+            "selections": {
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "properties": {
+                        "i": {"type": "integer"},
+                        "why": {"type": "string", "enum": WHY_ENUM},
+                    },
+                    "required": ["i", "why"],
+                    "additionalProperties": False,
+                },
+            }
+        },
+        "required": ["selections"],
+        "additionalProperties": False,
+    }
+    return {
+        "type": "json_schema",
+        "json_schema": {
+            "name": "stage3_selection",
+            "strict": True,
+            "schema": schema,
+        },
+    }
+def _get_llm(*, temperature: float, max_tokens: int, response_format: Dict[str, Any]) -> ChatOpenAI:
+    api_key = os.getenv("OPENROUTER_API_KEY")
+    if not api_key:
+        raise RuntimeError(
+            "OPENROUTER_API_KEY is not set.\n"
+            "Set it in your environment before running Stage 3."
+        )
+    api_key = SecretStr(cast(str, api_key))
+    model = os.getenv("OPENROUTER_MODEL", "meta-llama/llama-3.1-8b-instruct")
+    headers: Dict[str, str] = {}
+    if referer := os.getenv("OPENROUTER_HTTP_REFERER"):
+        headers["HTTP-Referer"] = referer
+    if title := os.getenv("OPENROUTER_X_TITLE"):
+        headers["X-Title"] = title
+    # OpenRouter OpenAI-compatible endpoint.
+    return ChatOpenAI(
+        model=model,
+        base_url="https://openrouter.ai/api/v1",
+        api_key=api_key,
+        temperature=temperature,
+        max_completion_tokens=max_tokens,
+        default_headers=headers,
+        # Provider-specific request body fields (OpenAI-compatible).
+        # Response Healing plugin reduces malformed-JSON failures (syntax only).
+        extra_body={
+            "response_format": response_format,
+            "plugins": [{"id": "response-healing"}],
+        },
+    )
+def _phrase_key_for_candidate(c: Candidate) -> str:
+    # Deterministic "primary phrase" for grouping.
+    if c.sources:
+        return sorted(c.sources)[0]
+    return ""
+def _interleave_round_robin(cands: Sequence[Candidate]) -> List[Candidate]:
+    """Round-robin interleave by primary source phrase.
+    NOTE: counts are used only for ordering; they are NOT shown to the LLM.
+    """
+    groups: Dict[str, List[Candidate]] = {}
+    for c in cands:
+        k = _phrase_key_for_candidate(c)
+        groups.setdefault(k, []).append(c)
+    for k in groups:
+        groups[k].sort(key=lambda x: (x.score_combined, (x.count or -1)), reverse=True)
+    keys = sorted(groups.keys())
+    out: List[Candidate] = []
+    idx = 0
+    while True:
+        progressed = False
+        for k in keys:
+            if idx < len(groups[k]):
+                out.append(groups[k][idx])
+                progressed = True
+        if not progressed:
+            break
+        idx += 1
+    return out
+def _display_tag(tag: str) -> str:
+    # Display tags with spaces for the LLM, but keep canonical underscores internally.
+    return tag.replace("_", " ")
+def _format_candidates_local(
+    cands: Sequence[Candidate],
+) -> Tuple[str, Dict[int, str], Dict[int, Candidate]]:
+    lines: List[str] = []
+    idx_to_tag: Dict[int, str] = {}
+    idx_to_candidate: Dict[int, Candidate] = {}
+    for j, c in enumerate(cands, start=1):
+        idx_to_tag[j] = c.tag
+        idx_to_candidate[j] = c
+        lines.append(f"{j}. {_display_tag(c.tag)}")
+    return "\n".join(lines), idx_to_tag, idx_to_candidate
+def _phrases_in_call(cands: Sequence[Candidate]) -> int:
+    s = set()
+    for c in cands:
+        for src in c.sources:
+            s.add(src)
+    return len(s)
+def _parse_validate_map(
+    parsed: Any,
+    idx_to_tag: Dict[int, str],
+    per_call_budget: int,
+) -> Tuple[List[Selected], Dict[str, Any]]:
+    diag = {
+        "parse_ok": isinstance(parsed, dict),
+        "invalid_items": 0,
+        "oob_indices": 0,
+        "dupe_indices": 0,
+        "kept": 0,
+    }
+    if isinstance(parsed, BaseModel):
+        parsed = parsed.model_dump() if hasattr(parsed, "model_dump") else parsed.dict()
+        diag["parse_ok"] = isinstance(parsed, dict)
+    if not isinstance(parsed, dict):
+        return [], diag
+    selections = parsed.get("selections", [])
+    if not isinstance(selections, list):
+        diag["parse_ok"] = False
+        return [], diag
+    out: List[Selected] = []
+    seen_i = set()
+    for item in selections:
+        if len(out) >= per_call_budget:
+            break
+        if not isinstance(item, dict):
+            diag["invalid_items"] += 1
+            continue
+        i = item.get("i")
+        why = item.get("why")
+        if isinstance(i, bool) or not isinstance(i, int):
+            diag["invalid_items"] += 1
+            continue
+        if i in seen_i:
+            diag["dupe_indices"] += 1
+            continue
+        if i not in idx_to_tag:
+            diag["oob_indices"] += 1
+            continue
+        if not isinstance(why, str) or why not in WHY_ENUM:
+            diag["invalid_items"] += 1
+            continue
+        seen_i.add(i)
+        tag = idx_to_tag[i]
+        out.append(Selected(i=i, tag=tag, why=why, score=WHY_TO_SCORE[why]))
+    diag["kept"] = len(out)
+    return out, diag
+def _split_candidates_by_type(
+    candidates: List[Candidate],
+    log,
+) -> Tuple[List[Tuple[int, Candidate]], List[Tuple[int, Candidate]]]:
+    """Split candidates into general vs entity (character only) lists.
+    Returns:
+        (general_list, entity_list) where each item is (original_index, candidate)
+    Tag types:
+        - General: 0 (general), 1 (artist), 5 (species), 7 (meta)
+        - Entity: 4 (character) only
+        - Filtered: 3 (copyright) - too broad for image generation
+    """
+    general_with_idx: List[Tuple[int, Candidate]] = []
+    entity_with_idx: List[Tuple[int, Candidate]] = []
+    unknown_count = 0
+    copyright_count = 0
+    for idx, cand in enumerate(candidates):
+        type_name = get_tag_type_name(cand.tag)
+        if type_name == "character":
+            entity_with_idx.append((idx, cand))
+        elif type_name == "copyright":
+            # Filter out copyright/series tags - too broad for image generation
+            copyright_count += 1
+        elif type_name in ("general", "artist", "species", "meta"):
+            general_with_idx.append((idx, cand))
+        else:
+            # Unknown or None - treat as general by default
+            general_with_idx.append((idx, cand))
+            unknown_count += 1
+    if log:
+        log(
+            f"Stage3 split: "
+            f"general={len(general_with_idx)} "
+            f"entity={len(entity_with_idx)} "
+            f"copyright_filtered={copyright_count} "
+            f"unknown_type={unknown_count}"
+        )
+    return general_with_idx, entity_with_idx
+# Regex to strip series/franchise suffixes from aliases, e.g. _(sonic), _(mlp), _(character)
+_SERIES_SUFFIX_RE = re.compile(r"_\([^)]+\)$")
+def _normalize_for_matching(text: str) -> str:
+    """Lowercase, replace underscores with spaces, strip series suffixes."""
+    text = text.lower().strip()
+    text = _SERIES_SUFFIX_RE.sub("", text)
+    text = text.replace("_", " ")
+    return text
+def _query_words(query: str) -> Set[str]:
+    """Extract individual words from the user query for matching."""
+    return set(_normalize_for_matching(query).split())
+def _alias_matches_query(alias_norm: str, query_words: Set[str], query_norm: str,
+                         fuzzy_threshold: int = 85) -> bool:
+    """Check if an alias matches the user query.
+    Matching logic:
+    1. Exact substring: alias appears as a substring of the query
+    2. Word subset: all words in the alias appear in the query words
+    3. Fuzzy: alias is close to a word in the query (handles typos)
+    """
+    # Exact substring match
+    if alias_norm in query_norm:
+        return True
+    alias_words = alias_norm.split()
+    if not alias_words:
+        return False
+    # Word subset match: all alias words must appear in query
+    if all(w in query_words for w in alias_words):
+        return True
+    # For single-word aliases, try fuzzy matching against each query word
+    if len(alias_words) == 1:
+        for qw in query_words:
+            if fuzz.ratio(alias_words[0], qw) >= fuzzy_threshold:
+                return True
+    # For multi-word aliases, try fuzzy partial ratio against whole query
+    if len(alias_words) > 1:
+        if fuzz.partial_ratio(alias_norm, query_norm) >= fuzzy_threshold:
+            return True
+    return False
+def _character_matches_via_aliases(
+    tag: str,
+    query: str,
+    tag2aliases: Dict[str, List[str]],
+    query_words: Set[str],
+    query_norm: str,
+    fuzzy_threshold: int = 85,
+) -> bool:
+    """Check if a character tag matches the user query via its aliases.
+    For a character tag to match:
+    - The tag name itself (normalized) must match, OR
+    - At least one of its registered aliases must match.
+    Empty aliases list means no known aliases; still check the tag name itself.
+    """
+    # Check the tag name itself
+    tag_norm = _normalize_for_matching(tag)
+    if _alias_matches_query(tag_norm, query_words, query_norm, fuzzy_threshold):
+        return True
+    # Check all registered aliases
+    aliases = tag2aliases.get(tag, [])
+    for alias in aliases:
+        alias_norm = _normalize_for_matching(alias)
+        if not alias_norm:
+            continue
+        if _alias_matches_query(alias_norm, query_words, query_norm, fuzzy_threshold):
+            return True
+    return False
+def llm_select_indices(
+    query_text: str,                 # kept for compatibility; treated as IMAGE DESCRIPTION
+    candidates: Union[
+        Sequence[Candidate],
+        Sequence[str],
+        Sequence[Tuple[str, float]],
+    ],
+    max_pick: int,                         # legacy param; applied after union + ordering (optional)
+    log,
+    retries: int = 2,
+    *,
+    mode: str = "chunked_map_union",       # "single_shot" or "chunked_map_union"
+    chunk_size: int = 60,
+    per_phrase_k: int = 2,                 # per-call budget = per_phrase_k * phrases_in_call
+    temperature: float = 0.0,
+    max_tokens: int = 512,
+) -> List[int]:
+    """Return indices into the ORIGINAL candidates list (legacy interface).
+    This implementation uses LangChain ONLY.
+    NOTE: query_text is treated as the image description (original prompt).
+    """
+    image_description = query_text
+    # Normalize candidates:
+    # - preferred: List[Candidate]
+    # - legacy: List[(tag, sim)] (count/sources unavailable)
+    norm: List[Candidate] = []
+    tag_to_first_index: Dict[str, int] = {}
+    branch = "empty"
+    cand0_type = type(candidates[0]).__name__ if candidates else "none"
+    if candidates and isinstance(candidates[0], Candidate):
+        branch = "candidate"
+        typed_candidates = cast(Sequence[Candidate], candidates)
+        for idx, c in enumerate(typed_candidates):
+            if c.tag not in tag_to_first_index:
+                tag_to_first_index[c.tag] = idx
+                norm.append(c)
+    elif candidates and isinstance(candidates[0], str):
+        branch = "string"
+        typed_candidates = cast(Sequence[str], candidates)
+        for idx, tag in enumerate(typed_candidates):
+            if tag not in tag_to_first_index:
+                tag_to_first_index[tag] = idx
+                norm.append(
+                    Candidate(
+                        tag=tag,
+                        score_combined=0.0,
+                        score_fasttext=None,
+                        score_context=None,
+                        count=None,
+                        sources=[],
+                    )
+                )
+    else:
+        if candidates:
+            branch = "tuple"
+        typed_candidates = cast(Sequence[Tuple[str, float]], candidates)
+        for idx, row in enumerate(typed_candidates):
+            if not isinstance(row, (list, tuple)) or len(row) < 2:
+                raise ValueError("Stage 3 candidates must be Candidate, tag strings, or (tag, score) tuples.")
+            tag, sim = row[0], row[1]
+            if tag not in tag_to_first_index:
+                tag_to_first_index[tag] = idx
+                norm.append(
+                    Candidate(
+                        tag=tag,
+                        score_combined=float(sim),
+                        score_fasttext=None,
+                        score_context=None,
+                        count=None,
+                        sources=[],
+                    )
+                )
+    if log:
+        if norm:
+            log(
+                "Stage3 input: "
+                f"type0={cand0_type} "
+                f"branch={branch} "
+                f"norm0_score={norm[0].score_combined!r} "
+                f"norm0_sources_empty={not bool(norm[0].sources)}"
+            )
+        else:
+            log(f"Stage3 input: type0={cand0_type} branch={branch} (no candidates)")
+    if mode not in ("single_shot", "chunked_map_union"):
+        raise ValueError(f"Invalid mode: {mode}")
+    response_format = _build_response_format()
+    llm = _get_llm(temperature=temperature, max_tokens=max_tokens, response_format=response_format)
+    model_name = os.getenv("OPENROUTER_MODEL", "meta-llama/llama-3.1-8b-instruct")
+    parser = PydanticOutputParser(pydantic_object=Stage3SelectionResponse)
+    # Global union: tag -> best (score, why)
+    best: Dict[str, Tuple[float, str]] = {}
+    def run_call(call_cands: Sequence[Candidate], label: str, system_template: str) -> None:
+        # Create chain with the provided system template
+        prompt = ChatPromptTemplate.from_messages(
+            [
+                ("system", system_template),
+                ("human", USER_TEMPLATE),
+            ],
+            template_format="f-string",
+        )
+        chain = prompt | llm | parser
+        ordered = _interleave_round_robin(call_cands)
+        candidate_lines, idx_to_tag, idx_to_candidate = _format_candidates_local(ordered)
+        N_local = len(idx_to_tag)
+        phrases = _phrases_in_call(call_cands)
+        per_call_budget = max(1, per_phrase_k * phrases) if phrases > 0 else per_phrase_k
+        summary_logged = False
+        if log:
+            log(f"Stage3 {label}: candidates (local indices):\n{candidate_lines}")
+            if phrases > 0:
+                distinct_phrases = sorted({src for c in call_cands for src in c.sources})
+                log(
+                    f"Stage3 {label}: distinct_phrases={len(distinct_phrases)} "
+                    f"phrases={', '.join(distinct_phrases)}"
+                )
+        # Invoke LangChain chain (templating fills {N} and other vars)
+        for att in range(retries + 1):
+            try:
+                if log:
+                    log(
+                        f"Stage3 {label}: "
+                        f"model={model_name} "
+                        f"N={N_local} "
+                        f"phrases={phrases} "
+                        f"per_call_budget={per_call_budget} "
+                        f"response_healing=on"
+                    )
+                parsed = chain.invoke(
+                    {
+                        "N": N_local,
+                        "image_description": image_description,
+                        "candidate_lines": candidate_lines,
+                        "per_call_budget": per_call_budget,
+                    }
+                )
+                selected, diag = _parse_validate_map(parsed, idx_to_tag, per_call_budget=per_call_budget)
+                if log:
+                    log(f"Stage3 {label}: attempt {att+1} diag={diag}")
+                    if not summary_logged and (selected or att == retries):
+                        log(
+                            f"Stage3 {label}: summary "
+                            f"N={N_local} selected={len(selected)} per_call_budget={per_call_budget}"
+                        )
+                        summary_logged = True
+                    if selected:
+                        lines = [
+                            f"Stage3 {label} selections:",
+                            *[
+                                (
+                                    f'  - i={s.i} tag="{s.tag}" '
+                                    f"why={s.why} score={s.score:.2f} "
+                                    f"sources={idx_to_candidate.get(s.i).sources if idx_to_candidate.get(s.i) else []}"
+                                )
+                                for s in selected
+                            ],
+                        ]
+                        log("\n".join(lines))
+                    else:
+                        log(f"Stage3 {label} selections: (none)")
+                if selected:
+                    for s in selected:
+                        prev = best.get(s.tag)
+                        if prev is None or s.score > prev[0]:
+                            best[s.tag] = (s.score, s.why)
+                    return
+            except Exception as e:
+                if log:
+                    log(f"Stage3 {label}: attempt {att+1} error: {e}")
+        if log:
+            log(f"Stage3 {label}: gave up after {retries+1} attempts")
+    # Split candidates by type (general vs entity)
+    general_with_idx, entity_with_idx = _split_candidates_by_type(norm, log)
+    # Extract just the candidates for LLM calls
+    general_cands = [cand for _, cand in general_with_idx]
+    entity_cands = [cand for _, cand in entity_with_idx]
+    # Process general candidates (attributes, actions, species, etc.)
+    if general_cands:
+        if mode == "single_shot":
+            run_call(general_cands, "general_single_shot", SELECT_SYSTEM_TEMPLATE)
+        else:
+            for start in range(0, len(general_cands), chunk_size):
+                run_call(
+                    general_cands[start:start + chunk_size],
+                    f"general_chunk_{start//chunk_size}",
+                    SELECT_SYSTEM_TEMPLATE
+                )
+    # Process entity candidates (characters only) with alias-based pre-filtering
+    if entity_cands:
+        tag2aliases = get_tag2aliases()
+        qwords = _query_words(image_description)
+        qnorm = _normalize_for_matching(image_description)
+        filtered_entity_cands: List[Candidate] = []
+        filtered_out: List[str] = []
+        for cand in entity_cands:
+            if _character_matches_via_aliases(
+                cand.tag, image_description, tag2aliases, qwords, qnorm
+            ):
+                filtered_entity_cands.append(cand)
+            else:
+                filtered_out.append(cand.tag)
+        if log:
+            log(
+                f"Stage3 entity alias filter: "
+                f"before={len(entity_cands)} "
+                f"after={len(filtered_entity_cands)} "
+                f"removed={len(filtered_out)}"
+            )
+            if filtered_out:
+                log(f"Stage3 entity alias filter removed: {filtered_out[:20]}")
+        if filtered_entity_cands:
+            if mode == "single_shot":
+                run_call(filtered_entity_cands, "entity_single_shot", ENTITY_SYSTEM_TEMPLATE)
+            else:
+                for start in range(0, len(filtered_entity_cands), chunk_size):
+                    run_call(
+                        filtered_entity_cands[start:start + chunk_size],
+                        f"entity_chunk_{start//chunk_size}",
+                        ENTITY_SYSTEM_TEMPLATE
+                    )
+    # Deterministic ordering: derived score desc, tie-break by count desc (count not shown to LLM).
+    count_by_tag = {c.tag: (c.count if c.count is not None else -1) for c in norm}
+    ordered_tags = sorted(best.keys(), key=lambda t: (best[t][0], count_by_tag.get(t, -1)), reverse=True)
+    # Legacy cap: apply AFTER union + ordering.
+    if isinstance(max_pick, int) and max_pick > 0:
+        ordered_tags = ordered_tags[:max_pick]
+    # Map back to original indices
+    out_idx: List[int] = []
+    for t in ordered_tags:
+        if t in tag_to_first_index:
+            out_idx.append(tag_to_first_index[t])
+    return out_idx

psq_rag/parsing/__init__.py ADDED Viewed

File without changes

psq_rag/parsing/prompt_grammar.py ADDED Viewed

	@@ -0,0 +1,60 @@

+import re
+from lark import Lark, Token
+# Lark grammar for prompts: comma-separated plain text with optional
+# "(...)" / "(...:NUMBER)" emphasis groups. Stray bracket/colon characters
+# that do not form a valid group are accepted by the `start` rule.
+# NOTE(review): extract_tags() matches tokens whose type is '__ANON_1'; that
+# name is presumably auto-assigned by Lark to the anonymous regex in `plain`,
+# so adding or reordering anonymous terminals here may change it - confirm.
+grammar=r"""
+!start: (prompt | /[][():]/+)*
+prompt: (emphasized | plain | comma | WHITESPACE)*
+!emphasized: "(" prompt ")"
+        | "(" prompt ":" [WHITESPACE] NUMBER [WHITESPACE] ")"
+comma: ","
+WHITESPACE: /\s+/
+plain: /([^,\\\[\]():|]|\\.)+/
+%import common.SIGNED_NUMBER -> NUMBER
+"""
+# Module-level parser instance, built once at import time and shared by callers.
+parser = Lark(grammar, start='start')
+# Function to extract tags
+def extract_tags(tree):
+    tags_with_positions = []
+    def _traverse(node):
+        if isinstance(node, Token) and node.type == '__ANON_1':
+            tag_position = node.start_pos
+            tag_text = node.value
+            tags_with_positions.append((tag_text, tag_position, "tag"))
+        elif not isinstance(node, Token):
+            for child in node.children:
+                _traverse(child)
+    _traverse(tree)
+    return tags_with_positions
+def build_tag_offsets_dicts(new_image_tags_with_positions):
+    # Structure the data for HighlightedText
+    tag_data = []
+    for tag_text, start_pos, nodetype in new_image_tags_with_positions:
+        # Modify the tag
+        modified_tag = tag_text.replace('_', ' ').replace('\\(', '(').replace('\\)', ')').strip()
+        artist_matrix_tag = tag_text.replace('_', ' ').replace('\\(', '\(').replace('\\)', '\)').strip()
+        tf_idf_matrix_tag = re.sub(r'\\([()])', r'\1', re.sub(r' ', '_', tag_text.strip().removeprefix('by ').removeprefix('by_')))
+        # Calculate the end position based on the original tag length
+        end_pos = start_pos + len(tag_text)
+        # Append the structured data for each tag
+        tag_data.append({
+            "original_tag": tag_text,
+            "start_pos": start_pos,
+            "end_pos": end_pos,
+            "modified_tag": modified_tag,
+            "artist_matrix_tag": artist_matrix_tag,
+            "tf_idf_matrix_tag": tf_idf_matrix_tag,
+            "node_type": nodetype
+        })
+    return tag_data
+# Running this module directly only prints a confirmation that it imported.
+if __name__ == "__main__":
+    print("prompt_grammar.py imports ok")

psq_rag/pipeline/__init__.py ADDED Viewed

File without changes

psq_rag/pipeline/preproc.py ADDED Viewed

	@@ -0,0 +1,36 @@

+import re
+def extract_user_provided_tags_upto_3_words(prompt_in: str) -> list[str]:
+    """
+    Heuristic:
+    - split on '.' and ','
+    - strip leading/trailing whitespace
+    - split on whitespace
+    - keep items with <= 3 tokens
+    """
+    if not prompt_in:
+        return []
+    parts = re.split(r"[.,]+", prompt_in)
+    out: list[str] = []
+    seen = set()
+    for raw in parts:
+        item = raw.strip()
+        if not item:
+            continue
+        tokens = item.split()
+        if len(tokens) <= 3:
+            key = item.lower()
+            if key not in seen:
+                seen.add(key)
+                out.append(item)
+    return out
+# Running this module directly only prints a confirmation that it imported.
+if __name__ == "__main__":
+    print("preproc.py imports ok")

psq_rag/retrieval/__init__.py ADDED Viewed

File without changes

psq_rag/retrieval/psq_retrieval.py ADDED Viewed

	@@ -0,0 +1,500 @@

+from __future__ import annotations
+import json
+import logging
+import math
+import os
+import pathlib
+import re
+from collections import Counter, OrderedDict
+from dataclasses import dataclass
+from itertools import islice
+from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
+import numpy as np
+import joblib
+from scipy.sparse import csr_matrix
+from .state import (
+    get_fasttext_model,
+    get_tag_counts,
+    get_hnsw_artist_index,
+    get_hnsw_tag_index,
+    get_nsfw_tags,
+    get_tfidf_components,
+    get_tfidf_tag_vectors,
+    get_alias2tags,
+)
+@dataclass(frozen=True)
+class Candidate:
+    """One retrieved tag together with its retrieval scores.
+    The dataclass is frozen, so fields cannot be reassigned; note that the
+    `sources` list itself is still mutable and is appended to in place by
+    psq_candidates_from_rewrite_phrases().
+    """
+    # Tag text as produced by the retrieval path that created the candidate.
+    tag: str
+    # Score used for ranking candidates.
+    score_combined: float
+    # FastText similarity; None when the candidate came from the TF-IDF-only path.
+    score_fasttext: Optional[float]
+    # Similarity to the TF-IDF/SVD query context; None when unavailable.
+    score_context: Optional[float]
+    # Value from the tag-count table, or None when the tag is not in it.
+    count: Optional[int]
+    # Rewrite phrases that surfaced this tag (empty for the TF-IDF-only path).
+    sources: List[str]
+def _norm_tag_for_lookup(s: str) -> str:
+    # convert "name with spaces" -> "name_with_spaces" and unescape parens
+    return s.replace(' ', '_').replace('\\(', '(').replace('\\)', ')')
+special_tags = ["score:0", "score:1", "score:2", "score:3", "score:4", "score:5", "score:6", "score:7", "score:8", "score:9", "rating:s", "rating:q", "rating:e"]
+def remove_special_tags(original_string):
+    tags = [tag.strip() for tag in original_string.split(",")]
+    remaining_tags = [tag for tag in tags if tag not in special_tags]
+    removed_tags = [tag for tag in tags if tag in special_tags]
+    return ", ".join(remaining_tags), removed_tags
+def construct_pseudo_vector(pseudo_doc_terms, idf, term_to_column_index):
+    cols, data = [], []
+    for term, w in pseudo_doc_terms.items():
+        j = term_to_column_index.get(term)
+        if j is None:
+            continue
+        cols.append(j)
+        data.append(w * idf[j])
+    n_cols = len(idf)
+    indptr = [0, len(cols)]
+    return csr_matrix((data, cols, indptr), shape=(1, n_cols), dtype=np.float32)
+def _ensure_dual_hnsw_indexes():
+    """
+    Build/load two HNSW indexes over the SVD-reduced TF-IDF matrix.
+    """
+    get_hnsw_tag_index()
+    get_hnsw_artist_index()
+    return
+def _hnsw_query(idx, vec: np.ndarray, k: int):
+    """
+    Query a given HNSW index with a (1, D) or (D,) vector in SVD space.
+    Returns (indices, sims) with cosine similarity scores.
+    """
+    q = np.asarray(vec, dtype=np.float32).reshape(-1)
+    q_norm = np.linalg.norm(q)
+    if q_norm > 0:
+        q = q / q_norm
+    labels, dists = idx.knn_query(q, k=k)
+    inds = labels[0]
+    sims = 1.0 - dists[0]  # cosine distance -> similarity
+    return inds, sims
+def _ann_tags_topk(vec: np.ndarray, k: int):
+    idx, n_items = get_hnsw_tag_index()
+    if idx is None:
+        return (np.array([], dtype=int), np.array([], dtype=float))
+    k = min(k, n_items if n_items else 0)
+    return _hnsw_query(idx, vec, k) if k else (np.array([], dtype=int), np.array([], dtype=float))
+def _ann_artists_topk(vec: np.ndarray, k: int):
+    idx, n_items = get_hnsw_artist_index()
+    if idx is None:
+        return (np.array([], dtype=int), np.array([], dtype=float))
+    k = min(k, n_items if n_items else 0)
+    return _hnsw_query(idx, vec, k) if k else (np.array([], dtype=int), np.array([], dtype=float))
+def get_tfidf_reduced_similar_tags(pseudo_doc_terms, allow_nsfw_tags):
+    tf_idf_components = get_tfidf_components()
+    idf = tf_idf_components["idf"]
+    term_to_column_index = tf_idf_components["tag_to_column_index"]
+    row_to_tag = tf_idf_components["row_to_tag"]
+    svd = tf_idf_components["svd_model"]
+    # 1) Build the pseudo TF-IDF, reduce to SVD space (unchanged)
+    pseudo_tfidf_vector = construct_pseudo_vector(pseudo_doc_terms, idf, term_to_column_index)
+    reduced_pseudo_vector = svd.transform(pseudo_tfidf_vector)  # shape (1, D)
+    # 2) ANN: only fetch nearest non-artist candidates (no full-matrix cosine)
+    K = 2000  # tune for speed/recall
+    top_inds, top_sims = _ann_tags_topk(reduced_pseudo_vector, k=K)
+    # 3) Build similarity dict from those candidates
+    tag_similarity_dict = {}
+    for i, sim in zip(top_inds, top_sims):
+        tag = row_to_tag.get(int(i))
+        if tag is not None:
+            tag_similarity_dict[tag] = float(sim)
+    if not allow_nsfw_tags:
+        nsfw_tags = get_nsfw_tags()
+        tag_similarity_dict = {t: s for t, s in tag_similarity_dict.items() if t not in nsfw_tags}
+    # 4) Sort & escape like before
+    sorted_tag_similarity_dict = OrderedDict(sorted(tag_similarity_dict.items(), key=lambda x: x[1], reverse=True))
+    transformed_sorted_tag_similarity_dict = OrderedDict(
+        (key.replace('_', ' ').replace('(', '\\(').replace(')', '\\)'), val)
+        for key, val in sorted_tag_similarity_dict.items()
+    )
+    return transformed_sorted_tag_similarity_dict
+def psq_candidates_from_terms(terms: Sequence[str], *, allow_nsfw_tags: bool, k: int = 300):
+    cand_dict = get_tfidf_reduced_similar_tags(dict(Counter(terms)), allow_nsfw_tags)
+    candidates = list(islice(cand_dict.items(), k))
+    tag_counts = get_tag_counts()
+    return [
+        Candidate(
+            tag=tag,
+            score_combined=float(score),
+            score_fasttext=None,
+            score_context=None,
+            count=tag_counts.get(tag),
+            sources=[],
+        )
+        for tag, score in candidates
+    ]
+def psq_candidates_from_rewrite_phrases(
+    rewrite_phrases: Sequence[str],
+    *,
+    allow_nsfw_tags: bool,
+    context_weight: float = 0.5,
+    per_phrase_k: int = 50,
+    per_phrase_final_k: int = 10,
+    global_k: int = 300,
+    verbose: bool = False,
+) -> Union[List[Candidate], Tuple[List[Candidate], List[Dict[str, Any]]]]:
+    """Retrieve candidate tags for a list of rewrite phrases.
+    Pipeline, as implemented below:
+    1. Normalize phrases (lowercase, underscores to spaces, collapsed
+       whitespace) and de-duplicate them. For multi-word phrases, also add the
+       last word as a "head" phrase when it has at least 3 characters and is
+       not a stopword.
+    2. Build a TF-IDF pseudo-document from the phrases found in the TF-IDF
+       vocabulary, reduce it with the SVD model and scale it to unit length.
+       This is the shared query context; it is unavailable when its norm is 0.
+    3. For each phrase, take FastText neighbours and project them to canonical
+       tags (directly, or through the alias table). An exact match of the
+       phrase itself gets FastText score 1.0 and is marked as required.
+    4. Score each candidate as
+       (1 - context_weight) * fasttext + context_weight * context. A missing
+       context score is imputed with the phrase's 10th-percentile context
+       score; when there is no query context, the FastText score is used alone.
+    5. Keep the top per_phrase_final_k rows per phrase (required tags are
+       forced in), merge across phrases keeping the maximum of each score, sort
+       by combined score and cap at global_k.
+    Args:
+        rewrite_phrases: Phrases to look up.
+        allow_nsfw_tags: When false, tags in the NSFW set are excluded.
+        context_weight: Weight of the context score in the combined score.
+        per_phrase_k: Number of FastText neighbours requested per phrase.
+        per_phrase_final_k: Number of rows kept per phrase after scoring.
+        global_k: Maximum number of merged candidates returned.
+        verbose: When true, also return per-phrase diagnostic reports.
+    Returns:
+        The merged candidate list, or (candidates, phrase_reports) when
+        verbose is true.
+    """
+    # Words that are never used as a phrase's standalone "head" term.
+    head_stopwords = {
+        "and",
+        "or",
+        "the",
+        "a",
+        "an",
+        "of",
+        "to",
+        "in",
+        "on",
+        "at",
+        "with",
+        "for",
+        "from",
+        "by",
+        "as",
+        "is",
+        "are",
+        "was",
+        "were",
+        "be",
+        "been",
+        "being",
+        "down",
+        "up",
+        "over",
+        "under",
+    }
+    def _normalize_phrase(phrase: str) -> str:
+        # Lowercase, treat underscores as spaces, collapse runs of whitespace.
+        lowered = (phrase or "").lower().strip().replace("_", " ")
+        return " ".join(lowered.split())
+    norm_phrases = [_normalize_phrase(p) for p in rewrite_phrases]
+    # dict.fromkeys de-duplicates while keeping first-seen order.
+    deduped_phrases = list(dict.fromkeys(p for p in norm_phrases if p))
+    if not deduped_phrases:
+        return ([], []) if verbose else []
+    # The last word of each multi-word phrase is queried as an extra phrase.
+    head_phrases: List[str] = []
+    for phrase in deduped_phrases:
+        parts = phrase.split()
+        if len(parts) >= 2:
+            head = parts[-1]
+            if len(head) >= 3 and head.lower() not in head_stopwords:
+                head_phrases.append(head)
+    final_phrases = list(dict.fromkeys(deduped_phrases + head_phrases))
+    fasttext_model = get_fasttext_model()
+    tag_counts = get_tag_counts()
+    nsfw_tags = get_nsfw_tags() if not allow_nsfw_tags else set()
+    alias2tags = get_alias2tags()
+    tfidf_components = get_tfidf_components()
+    tfidf_vocab = tfidf_components.get("tag_to_column_index", {})
+    idf = tfidf_components["idf"]
+    term_to_column_index = tfidf_components["tag_to_column_index"]
+    svd = tfidf_components["svd_model"]
+    # Build the shared query context from phrases present in the TF-IDF vocabulary.
+    pseudo_doc_terms = Counter()
+    oov_terms: List[str] = []
+    for phrase in final_phrases:
+        lookup = phrase.replace(" ", "_")
+        if lookup in term_to_column_index:
+            pseudo_doc_terms[lookup] += 1
+        elif verbose:
+            # Out-of-vocabulary terms are only tracked for the verbose report.
+            oov_terms.append(lookup)
+    pseudo_tfidf_vector = construct_pseudo_vector(pseudo_doc_terms, idf, term_to_column_index)
+    reduced_query_vector = svd.transform(pseudo_tfidf_vector).reshape(-1)
+    query_norm = np.linalg.norm(reduced_query_vector)
+    if query_norm > 0:
+        reduced_query_vector = reduced_query_vector / query_norm
+        query_has_context = True
+    else:
+        # No phrase hit the TF-IDF vocabulary; context scoring is skipped.
+        query_has_context = False
+    tag_vectors = get_tfidf_tag_vectors() if query_has_context else None
+    tag_to_row_index = tag_vectors["tag_to_row_index"] if tag_vectors else {}
+    phrase_candidate_maps: List[Tuple[str, Dict[str, float]]] = []
+    phrase_required_tags: Dict[str, Set[str]] = {}
+    phrase_best_tokens: Dict[str, Dict[str, str]] = {}
+    phrase_context_imputed: Dict[str, Dict[str, bool]] = {}
+    phrase_reports: List[Dict[str, Any]] = []
+    # Pass 1: gather FastText candidates per phrase.
+    for phrase in final_phrases:
+        lookup = phrase.replace(" ", "_")
+        def _project_to_canonicals(token: str) -> List[str]:
+            # A token that is itself a known tag maps to itself; otherwise it
+            # is resolved through the alias table; unknown tokens map to nothing.
+            if token in tag_counts or token in tag_to_row_index:
+                return [token]
+            if token in alias2tags:
+                return alias2tags[token]
+            return []
+        try:
+            neighbors = fasttext_model.most_similar(lookup, topn=per_phrase_k)
+        except KeyError:
+            neighbors = []
+        per_phrase_candidates: Dict[str, float] = {}
+        per_phrase_best_token: Dict[str, str] = {}
+        for token, sim in neighbors:
+            for canonical_tag in _project_to_canonicals(token):
+                if not allow_nsfw_tags and canonical_tag in nsfw_tags:
+                    continue
+                # Keep the best similarity (and the token that produced it) per tag.
+                prev = per_phrase_candidates.get(canonical_tag)
+                if prev is None or sim > prev:
+                    per_phrase_candidates[canonical_tag] = float(sim)
+                    per_phrase_best_token[canonical_tag] = token
+        # Tags matching the phrase exactly (directly or via alias) are required.
+        projected_lookup = _project_to_canonicals(lookup)
+        required_tags = set(projected_lookup)
+        if not allow_nsfw_tags:
+            required_tags = {tag for tag in required_tags if tag not in nsfw_tags}
+        for canonical_tag in projected_lookup:
+            if not allow_nsfw_tags and canonical_tag in nsfw_tags:
+                continue
+            # An exact match is given FastText similarity 1.0.
+            prev = per_phrase_candidates.get(canonical_tag)
+            if prev is None or 1.0 > prev:
+                per_phrase_candidates[canonical_tag] = 1.0
+                per_phrase_best_token[canonical_tag] = lookup
+        phrase_candidate_maps.append((phrase, per_phrase_candidates))
+        phrase_required_tags[phrase] = required_tags
+        phrase_best_tokens[phrase] = per_phrase_best_token
+        if verbose:
+            # Preliminary report rows; "candidates" is replaced with the final
+            # scored rows once scoring has finished (see the end of the function).
+            in_vocab = bool(tfidf_vocab and lookup in tfidf_vocab)
+            rows = []
+            for canonical_tag, sim in sorted(per_phrase_candidates.items(), key=lambda x: x[1], reverse=True):
+                if not allow_nsfw_tags and canonical_tag in nsfw_tags:
+                    continue
+                alias_token = per_phrase_best_token.get(canonical_tag, canonical_tag)
+                rows.append(
+                    {
+                        "tag": canonical_tag,
+                        "alias_token": alias_token,
+                        "score_fasttext": float(sim),
+                        "score_context": None,
+                        "score_combined": float(sim),
+                        "context_imputed": False,
+                        "count": tag_counts.get(canonical_tag),
+                    }
+                )
+            phrase_reports.append(
+                {
+                    "phrase": phrase,
+                    "normalized": phrase,
+                    "lookup": lookup,
+                    "tfidf_vocab": in_vocab,
+                    # The same list object is shared by every report.
+                    "oov_terms": oov_terms,
+                    "candidates": rows,
+                }
+            )
+    # Pass 2: compute one context score per distinct candidate tag.
+    all_candidate_tags: Set[str] = set()
+    for _, per_phrase_candidates in phrase_candidate_maps:
+        all_candidate_tags.update(per_phrase_candidates.keys())
+    score_context_by_tag: Dict[str, Optional[float]] = {}
+    if query_has_context:
+        # presumably rows of reduced_matrix_norm are unit-length, making the
+        # dot product a cosine similarity - confirm in get_tfidf_tag_vectors.
+        reduced_matrix_norm = tag_vectors["reduced_matrix_norm"]
+        for tag in all_candidate_tags:
+            row = tag_to_row_index.get(tag)
+            if row is None:
+                score_context_by_tag[tag] = None
+                continue
+            score_context_by_tag[tag] = float(np.dot(reduced_query_vector, reduced_matrix_norm[row]))
+    else:
+        for tag in all_candidate_tags:
+            score_context_by_tag[tag] = None
+    # Pass 3: score, trim and merge candidates phrase by phrase.
+    merged_by_tag: Dict[str, Candidate] = {}
+    per_phrase_scored: Dict[str, List[Tuple[str, float, Optional[float], float]]] = {}
+    for phrase, per_phrase_candidates in phrase_candidate_maps:
+        context_imputed_by_tag: Dict[str, bool] = {}
+        default_context_for_phrase = None
+        if query_has_context:
+            context_scores = [
+                score_context_by_tag.get(tag)
+                for tag in per_phrase_candidates.keys()
+            ]
+            context_scores = [score for score in context_scores if score is not None]
+            if context_scores:
+                # 10th percentile of this phrase's known context scores.
+                context_scores.sort()
+                index = int(math.floor(0.10 * (len(context_scores) - 1)))
+                default_context_for_phrase = float(context_scores[index])
+            else:
+                default_context_for_phrase = 0.0
+        # Each row is (tag, score_fasttext, score_context, score_combined).
+        scored_rows: List[Tuple[str, float, Optional[float], float]] = []
+        for tag, score_fasttext in per_phrase_candidates.items():
+            if not allow_nsfw_tags and tag in nsfw_tags:
+                continue
+            score_context = score_context_by_tag.get(tag)
+            context_imputed = False
+            if score_context is None and query_has_context:
+                # Impute missing context with the per-phrase 10th percentile.
+                score_context = default_context_for_phrase
+                context_imputed = True
+            if score_context is None:
+                score_combined = float(score_fasttext)
+            else:
+                score_combined = (1.0 - context_weight) * float(score_fasttext) + context_weight * score_context
+            scored_rows.append((tag, float(score_fasttext), score_context, score_combined))
+            context_imputed_by_tag[tag] = context_imputed
+        scored_rows.sort(key=lambda x: x[3], reverse=True)
+        required_tags = phrase_required_tags.get(phrase, set())
+        if required_tags:
+            scored_by_tag = {row[0]: row for row in scored_rows}
+            top_rows = scored_rows[:per_phrase_final_k]
+            top_tags = {row[0] for row in top_rows}
+            for required_tag in required_tags:
+                if required_tag in top_tags:
+                    continue
+                required_row = scored_by_tag.get(required_tag)
+                if required_row is None:
+                    # Required tag was not scored above; score it here with
+                    # the same formula, defaulting FastText similarity to 1.0.
+                    score_fasttext = per_phrase_candidates.get(required_tag)
+                    score_context = score_context_by_tag.get(required_tag)
+                    if score_fasttext is None:
+                        score_fasttext = 1.0
+                    context_imputed = False
+                    if score_context is None and query_has_context:
+                        score_context = default_context_for_phrase
+                        context_imputed = True
+                    if score_context is None:
+                        score_combined = float(score_fasttext)
+                    else:
+                        score_combined = (1.0 - context_weight) * float(score_fasttext) + context_weight * score_context
+                    required_row = (required_tag, float(score_fasttext), score_context, score_combined)
+                    context_imputed_by_tag[required_tag] = context_imputed
+                if len(top_rows) >= per_phrase_final_k:
+                    # Make room by dropping the lowest-ranked non-required row;
+                    # if every row is required, drop the last one.
+                    # NOTE(review): with per_phrase_final_k == 0 top_rows is
+                    # empty here and pop() would raise IndexError - confirm
+                    # callers never pass 0.
+                    drop_index = None
+                    for idx in range(len(top_rows) - 1, -1, -1):
+                        if top_rows[idx][0] not in required_tags:
+                            drop_index = idx
+                            break
+                    if drop_index is None:
+                        drop_index = -1
+                    top_rows.pop(drop_index)
+                top_rows.append(required_row)
+                top_tags.add(required_tag)
+            # Deterministic must-include for exact phrase matches; re-sort top-N by combined score.
+            top_rows.sort(key=lambda x: x[3], reverse=True)
+            scored_rows = top_rows
+        else:
+            scored_rows = scored_rows[:per_phrase_final_k]
+        per_phrase_scored[phrase] = scored_rows
+        phrase_context_imputed[phrase] = context_imputed_by_tag
+        # Merge this phrase's rows into the global per-tag table.
+        for tag, score_fasttext, score_context, score_combined in scored_rows:
+            existing = merged_by_tag.get(tag)
+            if existing is None:
+                merged_by_tag[tag] = Candidate(
+                    tag=tag,
+                    score_combined=score_combined,
+                    score_fasttext=score_fasttext,
+                    score_context=score_context,
+                    count=tag_counts.get(tag),
+                    sources=[phrase],
+                )
+            else:
+                # Candidate is frozen, but its sources list is mutated in place
+                # and then reused by the replacement Candidate below.
+                if phrase not in existing.sources:
+                    existing.sources.append(phrase)
+                # Keep the maximum of each score; None counts as "no value".
+                existing_fasttext = (
+                    existing.score_fasttext if existing.score_fasttext is not None else float("-inf")
+                )
+                incoming_fasttext = score_fasttext if score_fasttext is not None else float("-inf")
+                max_fasttext = max(existing_fasttext, incoming_fasttext)
+                existing_context = existing.score_context
+                if existing_context is None:
+                    max_context = score_context
+                elif score_context is None:
+                    max_context = existing_context
+                else:
+                    max_context = max(existing_context, score_context)
+                max_combined = max(existing.score_combined, score_combined)
+                merged_by_tag[tag] = Candidate(
+                    tag=tag,
+                    score_combined=max_combined,
+                    score_fasttext=max_fasttext if max_fasttext != float("-inf") else None,
+                    score_context=max_context,
+                    count=existing.count,
+                    sources=existing.sources,
+                )
+    if verbose:
+        # Replace the preliminary report rows with the final scored rows.
+        for report in phrase_reports:
+            phrase = report["phrase"]
+            rows = []
+            for tag, score_fasttext, score_context, score_combined in per_phrase_scored.get(phrase, []):
+                alias_token = phrase_best_tokens.get(phrase, {}).get(tag, tag)
+                context_imputed = phrase_context_imputed.get(phrase, {}).get(tag, False)
+                rows.append(
+                    {
+                        "tag": tag,
+                        "alias_token": alias_token,
+                        "score_fasttext": score_fasttext,
+                        "score_context": score_context,
+                        "score_combined": score_combined,
+                        "context_imputed": context_imputed,
+                        "count": tag_counts.get(tag),
+                    }
+                )
+            report["candidates"] = rows
+    # Final ranking: best combined score first, capped at global_k.
+    merged_candidates = list(merged_by_tag.values())
+    merged_candidates.sort(key=lambda c: c.score_combined, reverse=True)
+    merged_candidates = merged_candidates[:global_k]
+    return (merged_candidates, phrase_reports) if verbose else merged_candidates
+def psq_candidates_from_prompt(prompt: str, *, allow_nsfw_tags: bool, k: int = 300):
+    """Return Stage 2 candidates from a raw prompt."""
+    from ..parsing.prompt_grammar import build_tag_offsets_dicts, extract_tags, parser
+    p = (prompt or "").lower()
+    p, removed_special = remove_special_tags(p)
+    parsed = parser.parse(p)
+    tags_with_pos = extract_tags(parsed)
+    tag_data = build_tag_offsets_dicts(tags_with_pos)
+    # These are TF-IDF terms as your pipeline already expects
+    terms = [item["tf_idf_matrix_tag"] for item in tag_data] + removed_special
+    return psq_candidates_from_terms(terms, allow_nsfw_tags=allow_nsfw_tags, k=k)
+# Running this module directly only prints a confirmation that it imported.
+if __name__ == "__main__":
+    print("psq_retrieval.py imports ok")

psq_rag/retrieval/state.py ADDED Viewed

	@@ -0,0 +1,398 @@

+from __future__ import annotations
+import csv
+import logging
+import pathlib
+from typing import Any, Dict, List, Optional, Set, Tuple
+import joblib
+import numpy as np
+try:
+    import hnswlib
+except Exception:
+    hnswlib = None  # allow import on environments without hnswlib during partial tests
+# Data file locations. These are bare relative paths, so they resolve against
+# the process's current working directory.
+TFIDF_PATH = pathlib.Path("tf_idf_files_420.joblib")
+NSFW_CSV_PATH = pathlib.Path("word_rating_probabilities.csv")
+# Rows of the NSFW CSV whose numeric column is >= this value are treated as NSFW.
+NSFW_THRESHOLD = 0.95
+HNSW_ART_PATH = pathlib.Path("tfidf_hnsw_artists.bin")
+HNSW_TAG_PATH = pathlib.Path("tfidf_hnsw_tags.bin")
+FASTTEXT_MODEL_PATH = pathlib.Path("e621FastTextModel010Replacement_small.bin")
+TAG_ALIASES_PATH = pathlib.Path("fluffyrock_3m.csv")
+# Module-level caches. Each starts empty (None / 0) and is filled on first
+# use by the matching get_* function in this module.
+_tfidf_components: Optional[Dict[str, Any]] = None
+_nsfw_tags: Optional[Set[str]] = None
+_artist_set: Optional[Set[str]] = None
+_fasttext_model: Optional[Any] = None
+_tag_counts: Optional[Dict[str, int]] = None
+_tfidf_tag_vectors: Optional[Dict[str, Any]] = None
+_alias_to_tags: Optional[Dict[str, List[str]]] = None
+_tag_to_aliases: Optional[Dict[str, List[str]]] = None
+_tag_type_id: Optional[Dict[str, int]] = None
+# Annotations are quoted because hnswlib may be None when its import failed.
+_hnsw_tag_index: Optional["hnswlib.Index"] = None
+_hnsw_artist_index: Optional["hnswlib.Index"] = None
+_hnsw_tag_count: int = 0
+_hnsw_artist_count: int = 0
+# Tag type names inferred from e621 wiki documentation.
+# Numeric IDs come from fluffyrock_3m.csv column 1; mapping is heuristic but
+# matches observed usage on e621.
+TAG_TYPE_ID_TO_NAME: Dict[int, str] = {
+    0: "general",        # Default tag type: visible attributes, actions, objects, etc.
+    1: "artist",         # Artist tags (e.g. by_name, artist_name)
+    2: "contributor",    # Contributor tags (rare / possibly unused in this dataset)
+    3: "copyright",      # Series, franchise, or IP (e.g. pokemon, winnie_the_pooh)
+    4: "character",      # Named characters (e.g. pikachu, pinkie_pie_(mlp))
+    5: "species",        # Species tags (e.g. canine, domestic_cat)
+    6: "invalid",        # Invalid / disallowed / disambiguation-only tags
+    7: "meta",           # Meta / presentation / file / style-related tags
+}
+def _l2_normalize_rows(mat: np.ndarray) -> np.ndarray:
+    mat = np.asarray(mat, dtype=np.float32)
+    norms = np.linalg.norm(mat, axis=1, keepdims=True)
+    norms[norms == 0.0] = 1.0
+    return mat / norms
+def _clean_tag_ascii(tag: str) -> str:
+    return "".join(char for char in tag if ord(char) < 128)
+def clean_tag(tag: str) -> str:
+    """Normalize tags consistently with legacy alias parsing."""
+    return _clean_tag_ascii(tag)
+def build_aliases_dict(csv_path: str, reverse: bool = False) -> Dict[str, List[str]]:
+    """Build tag/alias mappings from the aliases CSV."""
+    aliases_dict: Dict[str, List[str]] = {}
+    with open(csv_path, "r", newline="", encoding="utf-8") as csvfile:
+        reader = csv.reader(csvfile)
+        for row in reader:
+            tag = clean_tag(row[0])
+            alias_list = [] if row[3] == "null" else [clean_tag(alias) for alias in row[3].split(",")]
+            if reverse:
+                for alias in alias_list:
+                    aliases_dict.setdefault(alias, []).append(tag)
+            else:
+                aliases_dict[tag] = alias_list
+    return aliases_dict
+def get_tfidf_components() -> Dict[str, Any]:
+    global _tfidf_components
+    if _tfidf_components is not None:
+        return _tfidf_components
+    if not TFIDF_PATH.is_file():
+        raise FileNotFoundError(f"TF-IDF joblib not found: {TFIDF_PATH}")
+    model_components = joblib.load(TFIDF_PATH)
+    if "tag_to_row_index" in model_components and "row_to_tag" not in model_components:
+        model_components["row_to_tag"] = {
+            idx: tag for tag, idx in model_components["tag_to_row_index"].items()
+        }
+    idf = model_components.get("idf")
+    if isinstance(idf, dict):
+        t2c = model_components["tag_to_column_index"]
+        n_cols = max(t2c.values()) + 1
+        idf_by_col = np.ones(n_cols, dtype=np.float32)
+        for term, col in t2c.items():
+            idf_by_col[col] = float(idf.get(term, 1.0))
+        model_components["idf"] = idf_by_col
+    _tfidf_components = model_components
+    return model_components
+def get_nsfw_tags() -> Set[str]:
+    global _nsfw_tags
+    if _nsfw_tags is not None:
+        return _nsfw_tags
+    if not NSFW_CSV_PATH.is_file():
+        raise FileNotFoundError(f"NSFW tag CSV not found: {NSFW_CSV_PATH}")
+    tags: Set[str] = set()
+    with NSFW_CSV_PATH.open("r", newline="", encoding="utf-8") as csvfile:
+        reader = csv.reader(csvfile)
+        next(reader, None)
+        for row in reader:
+            if not row:
+                continue
+            word = row[0]
+            try:
+                probability_sum = float(row[1])
+            except (IndexError, ValueError):
+                continue
+            if probability_sum >= NSFW_THRESHOLD:
+                tags.add(word)
+    _nsfw_tags = tags
+    return _nsfw_tags
+def get_artist_set() -> Set[str]:
+    global _artist_set
+    if _artist_set is not None:
+        return _artist_set
+    path = pathlib.Path("fluffyrock_3m.csv")
+    if not path.is_file():
+        _artist_set = set()
+        return _artist_set
+    artists: Set[str] = set()
+    with path.open("r", newline="", encoding="utf-8") as csvfile:
+        reader = csv.reader(csvfile)
+        for row in reader:
+            if not row:
+                continue
+            tag_name = row[0]
+            if tag_name.startswith("by_"):
+                artists.add(tag_name[3:])
+    _artist_set = artists
+    return _artist_set
+def is_artist(name: str) -> bool:
+    return name in get_artist_set()
+def get_fasttext_model() -> Any:
+    global _fasttext_model
+    if _fasttext_model is not None:
+        return _fasttext_model
+    if not FASTTEXT_MODEL_PATH.is_file():
+        raise FileNotFoundError(f"FastText model not found: {FASTTEXT_MODEL_PATH}")
+    import compress_fasttext
+    _fasttext_model = compress_fasttext.models.CompressedFastTextKeyedVectors.load(
+        str(FASTTEXT_MODEL_PATH)
+    )
+    return _fasttext_model
+def get_tag_type_ids() -> Dict[str, int]:
+    """Return canonical tag -> type_id (int) from fluffyrock_3m.csv.
+    Reads row[1] as int when possible. Missing/invalid values are skipped.
+    """
+    global _tag_type_id
+    if _tag_type_id is not None:
+        return _tag_type_id
+    if not TAG_ALIASES_PATH.is_file():
+        raise FileNotFoundError(f"Tag CSV not found: {TAG_ALIASES_PATH}")
+    m: Dict[str, int] = {}
+    with TAG_ALIASES_PATH.open("r", newline="", encoding="utf-8") as csvfile:
+        reader = csv.reader(csvfile)
+        for row in reader:
+            if not row:
+                continue
+            tag = clean_tag(row[0])
+            if len(row) < 2:
+                continue
+            try:
+                type_id = int(row[1])
+            except ValueError:
+                continue
+            m[tag] = type_id
+    _tag_type_id = m
+    return _tag_type_id
+def get_tag_type_name(tag: str) -> Optional[str]:
+    """Return heuristic type name for a tag (e.g. 'artist', 'character'), or None."""
+    tid = get_tag_type_ids().get(clean_tag(tag))
+    if tid is None:
+        return None
+    return TAG_TYPE_ID_TO_NAME.get(tid, f"type_{tid}")
+def get_tag_counts() -> Dict[str, int]:
+    global _tag_counts
+    if _tag_counts is not None:
+        return _tag_counts
+    if not TAG_ALIASES_PATH.is_file():
+        raise FileNotFoundError(f"Tag count CSV not found: {TAG_ALIASES_PATH}")
+    tag_counts: Dict[str, int] = {}
+    with TAG_ALIASES_PATH.open("r", newline="", encoding="utf-8") as csvfile:
+        reader = csv.reader(csvfile)
+        for row in reader:
+            if not row:
+                continue
+            key = row[0]
+            value = int(row[2]) if row[2].isdigit() else None
+            if value is not None:
+                tag_counts[key] = value
+    _tag_counts = tag_counts
+    return _tag_counts
+def get_alias2tags() -> Dict[str, List[str]]:
+    """Return alias -> [canonical tags] mapping."""
+    global _alias_to_tags
+    if _alias_to_tags is not None:
+        return _alias_to_tags
+    if not TAG_ALIASES_PATH.is_file():
+        raise FileNotFoundError(f"Tag alias CSV not found: {TAG_ALIASES_PATH}")
+    _alias_to_tags = build_aliases_dict(str(TAG_ALIASES_PATH), reverse=True)
+    return _alias_to_tags
+def get_tag2aliases() -> Dict[str, List[str]]:
+    """Return canonical tag -> [aliases] mapping."""
+    global _tag_to_aliases
+    if _tag_to_aliases is not None:
+        return _tag_to_aliases
+    if not TAG_ALIASES_PATH.is_file():
+        raise FileNotFoundError(f"Tag alias CSV not found: {TAG_ALIASES_PATH}")
+    _tag_to_aliases = build_aliases_dict(str(TAG_ALIASES_PATH), reverse=False)
+    return _tag_to_aliases
+def get_tfidf_tag_vectors() -> Dict[str, Any]:
+    global _tfidf_tag_vectors
+    if _tfidf_tag_vectors is not None:
+        return _tfidf_tag_vectors
+    components = get_tfidf_components()
+    reduced_matrix = components.get("reduced_matrix")
+    if reduced_matrix is None:
+        raise KeyError("TF-IDF components missing reduced_matrix")
+    row_to_tag = components.get("row_to_tag")
+    if row_to_tag is None and "tag_to_row_index" in components:
+        row_to_tag = {idx: tag for tag, idx in components["tag_to_row_index"].items()}
+    if row_to_tag is None:
+        raise KeyError("TF-IDF components missing row_to_tag mapping")
+    tag_to_row_index = components.get("tag_to_row_index")
+    if tag_to_row_index is None:
+        tag_to_row_index = {tag: idx for idx, tag in row_to_tag.items()}
+    reduced_matrix_norm = _l2_normalize_rows(reduced_matrix).astype(np.float32)
+    _tfidf_tag_vectors = {
+        "reduced_matrix": reduced_matrix,
+        "reduced_matrix_norm": reduced_matrix_norm,
+        "row_to_tag": row_to_tag,
+        "tag_to_row_index": tag_to_row_index,
+    }
+    return _tfidf_tag_vectors
+def retrieval_assets_status() -> Dict[str, bool]:
+    return {
+        "tfidf": TFIDF_PATH.is_file(),
+        "nsfw_csv": NSFW_CSV_PATH.is_file(),
+        "fasttext_model": FASTTEXT_MODEL_PATH.is_file(),
+        "tag_aliases_csv": TAG_ALIASES_PATH.is_file(),
+        "hnsw_tags": HNSW_TAG_PATH.is_file(),
+        "hnsw_artists": HNSW_ART_PATH.is_file(),
+    }
+def _build_or_load_index(path: pathlib.Path, rows: list[int], rm: np.ndarray, dim: int) -> "hnswlib.Index":
+    idx = hnswlib.Index(space="cosine", dim=dim)
+    need_build = True
+    if path.exists():
+        try:
+            idx.load_index(str(path), max_elements=max(1, len(rows)))
+            if getattr(idx, "get_current_count", None) and idx.get_current_count() == len(rows) and len(rows) > 0:
+                need_build = False
+            else:
+                logging.debug(
+                    "Rebuilding %s: saved_count!=rows_len (%s vs %s)",
+                    path.name,
+                    idx.get_current_count(),
+                    len(rows),
+                )
+        except Exception as e:
+            logging.debug("Reload %s failed, rebuilding: %s", path.name, e)
+    if need_build:
+        try:
+            if path.exists():
+                path.unlink()
+        except Exception:
+            pass
+        idx.init_index(max_elements=max(1, len(rows)), ef_construction=200, M=16)
+        if rows:
+            idx.add_items(rm[rows], ids=np.asarray(rows, dtype=np.int32))
+        idx.save_index(str(path))
+    idx.set_ef(200)
+    return idx
+def _ensure_hnsw_indexes(need_artists: bool) -> None:
+    """Populate the module-level HNSW tag index and, optionally, the artist index.
+    Does nothing when ``hnswlib`` is None or when every requested index is
+    already set. Otherwise the rows of the L2-normalised reduced TF-IDF
+    matrix are split into artist rows and tag rows, and each partition is
+    handed to ``_build_or_load_index``.
+    Args:
+        need_artists: also build/load the artist index. When False the artist
+            set is left empty, so every row lands in the tag partition.
+    """
+    global _hnsw_tag_index, _hnsw_artist_index, _hnsw_tag_count, _hnsw_artist_count
+    if hnswlib is None:
+        return
+    # Already satisfied: tag index present, and artist index present if asked for.
+    if _hnsw_tag_index is not None and (not need_artists or _hnsw_artist_index is not None):
+        return
+    components = get_tfidf_components()
+    reduced_matrix = components["reduced_matrix"]
+    row_to_tag = components["row_to_tag"]
+    rm = _l2_normalize_rows(reduced_matrix).astype(np.float32)
+    n_items, dim = rm.shape
+    artist_set = get_artist_set() if need_artists else set()
+    artist_rows: list[int] = []
+    tag_rows: list[int] = []
+    # NOTE(review): with need_artists=False, artist rows are indexed as tags.
+    # A later need_artists=True call recomputes tag_rows without them and runs
+    # _build_or_load_index for HNSW_TAG_PATH again -- confirm this call-order
+    # dependence is intended.
+    for i in range(n_items):
+        tag = row_to_tag.get(i, "")
+        base = tag[3:] if tag.startswith("by_") else tag
+        # These two "by_" tags always stay in the tag partition.
+        if tag in {"by_unknown_artist", "by_conditional_dnp"}:
+            tag_rows.append(i)
+            continue
+        if artist_set and is_artist(base):
+            artist_rows.append(i)
+        else:
+            tag_rows.append(i)
+    _hnsw_tag_index = _build_or_load_index(HNSW_TAG_PATH, tag_rows, rm, dim)
+    _hnsw_tag_count = len(tag_rows)
+    if need_artists:
+        _hnsw_artist_index = _build_or_load_index(HNSW_ART_PATH, artist_rows, rm, dim)
+        _hnsw_artist_count = len(artist_rows)
+def get_hnsw_tag_index() -> Tuple[Optional["hnswlib.Index"], int]:
+    _ensure_hnsw_indexes(need_artists=False)
+    return _hnsw_tag_index, _hnsw_tag_count
+def get_hnsw_artist_index() -> Tuple[Optional["hnswlib.Index"], int]:
+    _ensure_hnsw_indexes(need_artists=True)
+    return _hnsw_artist_index, _hnsw_artist_count

requirements.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+gradio==4.44.1
+gradio-client==1.3.0
+hnswlib==0.8.0
+numpy==1.25.1
+scikit-learn==1.4.1.post1
+h5py==3.8.0
+joblib==1.2.0
+compress-fasttext
+lark-parser
+scipy==1.12.0
+gensim==4.3.2
+huggingface_hub<1.0
+rapidfuzz>=3.0

scripts/extract_tag_patterns.py ADDED Viewed

	@@ -0,0 +1,272 @@

+#!/usr/bin/env python3
+"""
+Extract syntactic / compositional patterns from an e621-style tag CSV.
+Assumptions:
+- Input is a CSV (or TSV) where the FIRST column contains the tag string.
+- Tags are typically underscore-delimited, e.g. "blue_shirt", "looking_at_viewer".
+- We want PATTERN statistics, not just top tags.
+Outputs:
+- Top underscore-shape templates (e.g., "<w>_<w>", "<w>_<w>_<w>")
+- Top "suffix patterns" (e.g., "<w>_shirt", "<w>_fur") that catch color+object style combos
+- Top "prefix patterns" (e.g., "looking_<w>_<w>")
+- Heuristic slot-typed templates (e.g., "<color>_<clothing>") based on small dictionaries
+No third-party deps.
+"""
+import argparse
+import csv
+import os
+import random
+import re
+import sys
+from collections import Counter, defaultdict
+from typing import Dict, Iterable, List, Tuple
+# ---- small heuristic lexicons (edit freely) ----
+# Keep these small: they are for *pattern discovery*, not canonicalization.
+# NOTE(review): typed_token() is applied to tag parts produced by splitting on
+# "_", so entries that themselves contain an underscore ("two_tone",
+# "two_toned") cannot match through that path.
+COLORS = {
+    "black","white","grey","gray","red","blue","green","yellow","orange","purple","pink","brown","tan",
+    "silver","gold","blonde","blond","aqua","teal","cyan","magenta","violet","indigo","maroon","navy",
+    "beige","cream","ivory","turquoise","lavender",
+    "multicolored","two_tone","two_toned",
+}
+# Clothing nouns seen frequently in tag vocabularies (add as you notice them).
+CLOTHING = {
+    "shirt","pants","shorts","dress","skirt","underwear","panties","bra","bikini","swimwear",
+    "topwear","bottomwear","legwear","handwear","armwear","footwear","stockings","socks","shoes","boots",
+    "gloves","hat","headwear","headgear","collar","armor","mask",
+}
+# Body-part nouns; typed_token() labels them "<body>".
+BODY = {
+    "fur","hair","eyes","tail","ears","horn","wings","paws","toes","fingers","nipples","breasts",
+    "belly","butt","penis","pussy","anus","clitoris","hooves","teeth","fangs","tongue","nose",
+}
+# Viewpoint-related words. Not referenced in the portion of this script shown here.
+VIEW_WORDS = {
+    "front","rear","side","back","from","first","third"
+}
+# ---- helpers ----
+def detect_dialect(path: str) -> csv.Dialect:
+    # Very small sniff. If it fails, fall back to comma.
+    with open(path, "r", encoding="utf-8", newline="") as f:
+        sample = f.read(4096)
+    try:
+        return csv.Sniffer().sniff(sample, delimiters=[",", "\t", ";", "|"])
+    except Exception:
+        return csv.get_dialect("excel")
+def iter_tags_from_first_col(path: str, sample_n: int | None, seed: int) -> Iterable[str]:
+    dialect = detect_dialect(path)
+    rng = random.Random(seed)
+    # If sampling, do reservoir sampling so we don't load the whole file.
+    reservoir: List[str] = []
+    seen = 0
+    with open(path, "r", encoding="utf-8", newline="") as f:
+        reader = csv.reader(f, dialect=dialect)
+        for row in reader:
+            if not row:
+                continue
+            tag = row[0].strip()
+            if not tag:
+                continue
+            # skip a header row if it looks like one
+            if seen == 0 and tag.lower() in {"tag", "tags", "name"}:
+                seen += 1
+                continue
+            seen += 1
+            if sample_n is None:
+                yield tag
+            else:
+                # reservoir sampling
+                if len(reservoir) < sample_n:
+                    reservoir.append(tag)
+                else:
+                    j = rng.randrange(seen)
+                    if j < sample_n:
+                        reservoir[j] = tag
+    if sample_n is not None:
+        for t in reservoir:
+            yield t
+# Matches one tag part: lowercase alphanumerics, optionally joined by "/" or "-".
+# Not referenced in the portion of this script shown here.
+_word_re = re.compile(r"^[a-z0-9]+(?:[/-][a-z0-9]+)*$")
+def normalize_tag(tag: str) -> str:
+    # keep underscores; strip whitespace; lowercase.
+    return tag.strip().lower()
+def split_parts(tag: str) -> List[str]:
+    return [p for p in tag.split("_") if p]
+def underscore_shape(parts: List[str]) -> str:
+    # e.g. 1 part -> "<w>"
+    return "_".join(["<w>"] * len(parts))
+def suffix_pattern(parts: List[str], k: int = 1) -> str | None:
+    # "<w>_<w>_shirt" style: wildcard prefix + suffix tokens.
+    if len(parts) <= k:
+        return None
+    suf = "_".join(parts[-k:])
+    return "<w>_" + suf if k == 1 else "<w>..._" + suf
+def prefix_pattern(parts: List[str], k: int = 1) -> str | None:
+    # "looking_<w>_<w>" style: prefix token(s) + wildcard suffix
+    if len(parts) <= k:
+        return None
+    pre = "_".join(parts[:k])
+    return pre + "_<w>" if k == 1 else pre + "_<w>..."
+def typed_token(tok: str) -> str:
+    # heuristic slot typing
+    if tok.isdigit():
+        return "<num>"
+    if tok in COLORS:
+        return "<color>"
+    if tok in CLOTHING:
+        return "<clothing>"
+    if tok in BODY:
+        return "<body>"
+    if tok in {"male","female","intersex","gynomorph","ambiguous_gender"}:
+        return "<gender>"
+    return "<w>"
+def typed_template(parts: List[str]) -> str:
+    return "_".join(typed_token(p) for p in parts)
+def bigram_templates(parts: List[str]) -> List[str]:
+    # adjacent pair templates: useful for color+thing detection even if full tag is longer
+    out = []
+    for a, b in zip(parts, parts[1:]):
+        out.append(f"{typed_token(a)}_{typed_token(b)}")
+    return out
+def print_counter(title: str, c: Counter, top: int, min_count: int) -> None:
+    print("\n" + title)
+    print("-" * len(title))
+    shown = 0
+    for key, val in c.most_common():
+        if val < min_count:
+            break
+        print(f"{val:>8}  {key}")
+        shown += 1
+        if shown >= top:
+            break
+    if shown == 0:
+        print("(no entries above min_count)")
+def main() -> None:
+    ap = argparse.ArgumentParser()
+    ap.add_argument("csv_path", help="Path to CSV/TSV; first column is tag")
+    ap.add_argument("--top", type=int, default=100, help="Top N patterns to print per section")
+    ap.add_argument("--min-count", type=int, default=25, help="Minimum count to show")
+    ap.add_argument("--sample", type=int, default=None, help="Reservoir sample N tags instead of full file")
+    ap.add_argument("--seed", type=int, default=0, help="RNG seed for sampling")
+    ap.add_argument("--max-rows", type=int, default=None, help="Hard stop after reading this many rows (debug)")
+    args = ap.parse_args()
+    path = args.csv_path
+    if not os.path.exists(path):
+        print(f"ERROR: file not found: {path}", file=sys.stderr)
+        sys.exit(1)
+    shape_counts = Counter()
+    typed_counts = Counter()
+    suffix1_counts = Counter()
+    suffix2_counts = Counter()
+    prefix1_counts = Counter()
+    prefix2_counts = Counter()
+    bigram_typed_counts = Counter()
+    token_counts = Counter()
+    length_counts = Counter()
+    prefix_head_counts = defaultdict(Counter)
+    read = 0
+    for raw_tag in iter_tags_from_first_col(path, args.sample, args.seed):
+        tag = normalize_tag(raw_tag)
+        parts = split_parts(tag)
+        if not parts:
+            continue
+        if len(parts) >= 2:
+            prefix = parts[0]
+            head = parts[1]
+            prefix_head_counts[prefix][head] += 1
+        read += 1
+        if args.max_rows is not None and read > args.max_rows:
+            break
+        length_counts[len(parts)] += 1
+        token_counts.update(parts)
+        shape_counts[underscore_shape(parts)] += 1
+        typed_counts[typed_template(parts)] += 1
+        bigram_typed_counts.update(bigram_templates(parts))
+        sp1 = suffix_pattern(parts, k=1)
+        if sp1: suffix1_counts[sp1] += 1
+        sp2 = suffix_pattern(parts, k=2)
+        if sp2: suffix2_counts[sp2] += 1
+        pp1 = prefix_pattern(parts, k=1)
+        if pp1: prefix1_counts[pp1] += 1
+        pp2 = prefix_pattern(parts, k=2)
+        if pp2: prefix2_counts[pp2] += 1
+    print(f"Read {read} tags from {path}")
+    print_counter("Tag length distribution (#parts)", length_counts, top=50, min_count=1)
+    print_counter("Top underscore-shapes", shape_counts, top=args.top, min_count=args.min_count)
+    # Typed templates will be sparse if lexicons are small; still useful.
+    print_counter("Top typed templates (heuristic)", typed_counts, top=args.top, min_count=args.min_count)
+    # Bigrams are the best way to surface “collectively important” schemas.
+    print_counter("Top typed bigrams (heuristic, adjacent parts)", bigram_typed_counts, top=args.top, min_count=args.min_count)
+    # Suffix patterns show color+THING and modifier+THING tendencies.
+    print_counter("Top suffix patterns (last token)", suffix1_counts, top=args.top, min_count=args.min_count)
+    print_counter("Top suffix patterns (last 2 tokens)", suffix2_counts, top=args.top, min_count=args.min_count)
+    # Prefix patterns show looking_* and similar families.
+    print_counter("Top prefix patterns (first token)", prefix1_counts, top=args.top, min_count=args.min_count)
+    print_counter("Top prefix patterns (first 2 tokens)", prefix2_counts, top=args.top, min_count=args.min_count)
+    # Show the most common tokens too (useful for expanding lexicons).
+    print_counter("Top tokens (raw parts)", token_counts, top=200, min_count=max(args.min_count, 100))
+    def print_prefix_families(prefix_head_counts, top_prefixes=50, top_heads=15, min_prefix_count=100):
+        print("\nPrefix -> Head Families")
+        print("----------------------")
+        # rank prefixes by total usage
+        prefix_totals = {
+            p: sum(heads.values())
+            for p, heads in prefix_head_counts.items()
+        }
+        for prefix, total in sorted(prefix_totals.items(), key=lambda x: -x[1]):
+            if total < min_prefix_count:
+                break
+            print(f"\nPREFIX: {prefix}  (total={total})")
+            for head, cnt in prefix_head_counts[prefix].most_common(top_heads):
+                print(f"  {head:<20} {cnt}")
+    print_prefix_families(prefix_head_counts)
+if __name__ == "__main__":
+    main()

scripts/rewrite_playground.py ADDED Viewed

	@@ -0,0 +1,130 @@

+import argparse
+import json
+import os
+from pathlib import Path
+import requests
+CAPTION_FIELDS = ["caption_llm_4", "caption_llm_6", "caption_cogvlm"]
+# Start with something minimal. You will iterate this.
+REWRITE_SYSTEM = """Rewrite the input into a concise, comma-separated list of short phrases
+that resemble image tags.
+Use short, literal phrases that reflect how visual concepts are commonly
+written in image tag vocabularies.
+Multi-word phrases are appropriate when they represent one coherent
+visual idea.
+Examples of tag-shaped phrases:
+- wolf, angry
+- blue jacket, striped tail
+- long hair, raised ears
+- holding object, hand on shoulder
+- looking at viewer, looking down
+- simple background, outdoor scene
+- wooden table, plant
+- running, sleeping
+- smiling, angry expression
+- bedroom, forest
+- sonic the hedgehog, princess peach
+Do not invent details or guess identities.
+Do not infer demographic attributes (e.g., gender/age) unless explicitly stated.
+Output ONLY the rewritten list."""
+def load_jsonl(path: Path):
+    with path.open("r", encoding="utf-8") as f:
+        for line in f:
+            yield json.loads(line)
+def openrouter_chat(model: str, system: str, user: str, temperature: float = 0.0, max_tokens: int = 200) -> str:
+    api_key = os.environ.get("OPENROUTER_API_KEY")
+    if not api_key:
+        raise RuntimeError("Set OPENROUTER_API_KEY in your environment.")
+    url = "https://openrouter.ai/api/v1/chat/completions"
+    headers = {
+        "Authorization": f"Bearer {api_key}",
+        "Content-Type": "application/json",
+    }
+    payload = {
+        "model": model,
+        "temperature": temperature,
+        "max_tokens": max_tokens,
+        "messages": [
+            {"role": "system", "content": system},
+            {"role": "user", "content": user},
+        ],
+    }
+    r = requests.post(url, headers=headers, json=payload, timeout=60)
+    r.raise_for_status()
+    data = r.json()
+    return data["choices"][0]["message"]["content"].strip()
+def main() -> None:
+    ap = argparse.ArgumentParser(description="Interactive prompt **query rewriting** playground.")
+    ap.add_argument("--sample", type=str, required=True, help="Path to the trimmed JSONL sample.")
+    ap.add_argument("--field", type=str, default="caption_llm_6", choices=CAPTION_FIELDS)
+    ap.add_argument("--model", type=str, default="meta-llama/llama-3.1-8b-instruct")
+    ap.add_argument("--temperature", type=float, default=0.0)
+    ap.add_argument("--max-tokens", type=int, default=200)
+    ap.add_argument("--start", type=int, default=0, help="Index to start from within the loaded examples.")
+    args = ap.parse_args()
+    rows = []
+    for row in load_jsonl(Path(args.sample)):
+        text = (row.get(args.field) or "").strip()
+        if text:
+            gt = row.get("tags_ground_truth_categorized")
+            rows.append((str(row["id"]), text, gt))
+    if not rows:
+        raise RuntimeError(f"No non-empty rows found for field={args.field}")
+    print(f"Loaded {len(rows)} examples from {args.sample} using {args.field}.")
+    print("Commands: [Enter]=next  |  r=rerun current (same input)  |  q=quit\n")
+    if args.start < 0 or args.start >= len(rows):
+        raise ValueError(f"--start must be in [0, {len(rows)-1}] but got {args.start}")
+    idx = args.start
+    while True:
+        row_id, prompt, gt = rows[idx]
+        print("=" * 80)
+        print(f"row_id: {row_id}")
+        print(f"ORIGINAL:\n{prompt}\n")
+        rewritten = openrouter_chat(
+            model=args.model,
+            system=REWRITE_SYSTEM,
+            user=prompt,
+            temperature=args.temperature,
+            max_tokens=args.max_tokens,
+        )
+        print(f"REWRITE:\n{rewritten}\n")
+        if gt:
+            gt_dict = json.loads(gt)
+            flat_gt = sorted({tag for tags in gt_dict.values() for tag in tags})
+            print(f"GROUND TRUTH TAGS:\n{', '.join(flat_gt)}\n")
+        cmd = input("> ").strip().lower()
+        if cmd == "q":
+            break
+        if cmd == "r":
+            continue
+        idx += 1
+        if idx >= len(rows):
+            print("End of samples.")
+            break
+if __name__ == "__main__":
+    main()

scripts/sample_dataset_streaming.py ADDED Viewed

	@@ -0,0 +1,68 @@

+import argparse
+import json
+from pathlib import Path
+from datasets import load_dataset
+DATASET_ID = "CaptionEmporium/furry-e621-sfw-7m-hq"
+SPLIT = "train"
+# Adjust these names if your actual columns differ.
+CAPTION_FIELDS = ["caption_llm_6", "caption_llm_8", "caption_cogvlm"]
+KEEP_FIELDS = ["tags_ground_truth_categorized"] + CAPTION_FIELDS
+def pick_id(row: dict) -> str:
+    # Try a few common id keys; fall back to a hash-like stable string.
+    for k in ("id", "post_id", "e621_id", "image_id"):
+        if k in row and row[k] not in (None, ""):
+            return str(row[k])
+    # As a fallback, derive a stable-ish id from caption text.
+    base = (row.get("caption_llm_6") or row.get("caption_llm_8") or row.get("caption_cogvlm") or "")
+    return f"no_id:{hash(base)}"
+def main() -> None:
+    ap = argparse.ArgumentParser(description="Stream+shuffle sample and save a trimmed JSONL for prompt experiments.")
+    ap.add_argument("--n", type=int, default=1000)
+    ap.add_argument("--seed", type=int, default=123)
+    ap.add_argument("--buffer-size", type=int, default=10_000)
+    ap.add_argument(
+        "--out",
+        type=str,
+        default="data/eval_samples/e621_sfw_sample_1000_seed123_buffer10000_trimmed.jsonl",
+    )
+    ap.add_argument(
+        "--require-any-caption",
+        action="store_true",
+        help="If set, only keep rows where at least one of the caption fields is non-empty.",
+    )
+    args = ap.parse_args()
+    out_path = Path(args.out)
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    ds = load_dataset(DATASET_ID, split=SPLIT, streaming=True)
+    ds = ds.shuffle(seed=args.seed, buffer_size=args.buffer_size)
+    wrote = 0
+    with out_path.open("w", encoding="utf-8") as f:
+        for row in ds:
+            out = {"row_id": pick_id(row)}
+            for k in KEEP_FIELDS:
+                out[k] = row.get(k, "")
+            if args.require_any_caption:
+                if not any((out.get(c) or "").strip() for c in CAPTION_FIELDS):
+                    continue
+            f.write(json.dumps(out, ensure_ascii=False) + "\n")
+            wrote += 1
+            if wrote >= args.n:
+                break
+    print(f"Wrote {wrote} rows to: {out_path}")
+if __name__ == "__main__":
+    main()

scripts/smoke_test.py ADDED Viewed

	@@ -0,0 +1,159 @@

+from pathlib import Path
+import sys
+import traceback
+import os
+# Add repo root (parent of /scripts) to sys.path
+repo_root = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(repo_root))
+os.chdir(repo_root)
+def main():
+    """Run the smoke checks: imports, optional LLM rewrite, Stage 2 contract tests.
+    With ``--stage2-only`` on the command line, only the two Stage 2 tests
+    run and the process exits with status 1 if an assertion fails. Without
+    it, the LLM rewrite is attempted and the artist set and NSFW tag set
+    sizes are logged; failures there are logged rather than raised.
+    """
+    # Project imports happen here, after the module top level has put the
+    # repo root on sys.path.
+    from psq_rag.llm.rewrite import llm_rewrite_prompt
+    from psq_rag.retrieval.psq_retrieval import (
+        psq_candidates_from_rewrite_phrases,
+    )
+    from psq_rag.retrieval.state import (
+        get_artist_set,
+        get_nsfw_tags,
+    )
+    def log(x=""):
+        """Print one line; also passed to llm_rewrite_prompt as its log hook."""
+        print(x)
+    def assert_true(condition, message):
+        """Raise AssertionError(message) when ``condition`` is falsy."""
+        if not condition:
+            raise AssertionError(message)
+    def print_failure(message, exc):
+        """Log a FAIL line followed by the exception's type and message, if any."""
+        log(f"FAIL: {message}")
+        if exc is not None:
+            for line in traceback.format_exception_only(type(exc), exc):
+                log(line.rstrip())
+    def import_sanity():
+        """Import the main project modules and log success or the first failure."""
+        try:
+            __import__("psq_rag.retrieval.state")
+            __import__("psq_rag.retrieval.psq_retrieval")
+            __import__("psq_rag.parsing.prompt_grammar")
+            __import__("psq_rag.llm.rewrite")
+            import app
+            log("import sanity: ok")
+        except Exception as e:
+            log(f"import sanity: {type(e).__name__}: {e}")
+    import_sanity()
+    stage2_only = "--stage2-only" in sys.argv
+    if not stage2_only:
+        prompt = "ape, raised arms, looking at viewer"
+        rewrite = llm_rewrite_prompt(prompt, log)
+        if rewrite:
+            print("rewrite:", rewrite)
+        else:
+            log("LLM rewrite: no result (continuing)")
+    def run_stage2_test_a():
+        """Check the per-phrase report contract for two shirt phrases."""
+        phrases = ["big shirt", "grey shirt"]
+        cands, per_phrase = psq_candidates_from_rewrite_phrases(
+            rewrite_phrases=phrases,
+            allow_nsfw_tags=True,
+            verbose=True,
+            global_k=300,
+            per_phrase_k=50,
+            per_phrase_final_k=10,
+        )
+        print("cands:", len(cands))
+        assert_true(isinstance(per_phrase, list), "per_phrase must be a list")
+        phrase_set = {report.get("phrase") for report in per_phrase}
+        assert_true("big shirt" in phrase_set, "per_phrase missing entry for 'big shirt'")
+        assert_true("grey shirt" in phrase_set, "per_phrase missing entry for 'grey shirt'")
+        # Both inputs share the head noun, which is expected as its own entry.
+        assert_true("shirt" in phrase_set, "per_phrase missing head-noun expansion for 'shirt'")
+        required_report_keys = {"phrase", "normalized", "lookup", "tfidf_vocab", "oov_terms", "candidates"}
+        required_row_keys = {
+            "tag",
+            "alias_token",
+            "score_fasttext",
+            "score_context",
+            "score_combined",
+            "context_imputed",
+            "count",
+        }
+        for report in per_phrase:
+            assert_true(required_report_keys.issubset(report.keys()), "per_phrase missing required keys")
+            rows = report.get("candidates", [])
+            assert_true(isinstance(rows, list), "per_phrase candidates must be a list")
+            for row in rows:
+                assert_true(required_row_keys.issubset(row.keys()), "candidate row missing required keys")
+        big_report = None
+        for report in per_phrase:
+            if report.get("phrase") == "big shirt":
+                big_report = report
+                break
+        assert_true(big_report is not None, "no per_phrase report found for 'big shirt'")
+        big_tags = {row.get("tag") for row in big_report.get("candidates", [])}
+        assert_true("big_shirt" in big_tags, "big_shirt missing from per_phrase_final_k for 'big shirt'")
+        log("stage2-only test A: PASS")
+    def run_stage2_test_b():
+        """Check that allow_nsfw_tags=False removes an NSFW tag from the candidates."""
+        # The misspelled phrase is the input; the assertions below look for
+        # the correctly spelled tag among the candidates.
+        phrases = ["anuss"]
+        result_unfiltered = psq_candidates_from_rewrite_phrases(
+            rewrite_phrases=phrases,
+            allow_nsfw_tags=True,
+            verbose=False,
+            global_k=300,
+            per_phrase_k=50,
+            per_phrase_final_k=10,
+        )
+        result_filtered = psq_candidates_from_rewrite_phrases(
+            rewrite_phrases=phrases,
+            allow_nsfw_tags=False,
+            verbose=False,
+            global_k=300,
+            per_phrase_k=50,
+            per_phrase_final_k=10,
+        )
+        # Accept either a bare candidate list or a tuple whose first item is the list.
+        cands_unfiltered = result_unfiltered[0] if isinstance(result_unfiltered, tuple) else result_unfiltered
+        cands_filtered = result_filtered[0] if isinstance(result_filtered, tuple) else result_filtered
+        def extract_tag(row):
+            """Return the tag of a candidate given as a mapping or as an object."""
+            if hasattr(row, "get"):
+                return row.get("tag")
+            return getattr(row, "tag", None)
+        unfiltered_tags = {extract_tag(row) for row in cands_unfiltered}
+        filtered_tags = {extract_tag(row) for row in cands_filtered}
+        assert_true("anus" in unfiltered_tags, "anus missing from unfiltered candidates")
+        assert_true("anus" not in filtered_tags, "anus unexpectedly present in filtered candidates")
+        log(f"stage2-only test B: PASS (anus in unfiltered={ 'anus' in unfiltered_tags }, in filtered={ 'anus' in filtered_tags })")
+    if stage2_only:
+        # Only AssertionError is handled here; any other exception propagates.
+        try:
+            run_stage2_test_a()
+            run_stage2_test_b()
+        except AssertionError as exc:
+            print_failure("stage2 contract assertion failed", exc)
+            sys.exit(1)
+        return
+    # Artist set check (optional in RAG mode)
+    try:
+        artists = get_artist_set()
+        log(f"artist set size: {len(artists)}")
+    except Exception as e:
+        log(f"artist set: {type(e).__name__}: {e}")
+    # NSFW tag set check; a failure is logged, not raised.
+    try:
+        nsfw_tags = get_nsfw_tags()
+        log(f"nsfw tag count: {len(nsfw_tags)}")
+    except Exception as e:
+        log(f"nsfw tags: {type(e).__name__}: {e}")
+if __name__ == "__main__":
+    main()

scripts/stage3_debug.py ADDED Viewed

	@@ -0,0 +1,359 @@

+"""Stage 3 debug harness.
+Goal: Run a realistic Stage 3 selection loop without manually typing hundreds of candidates.
+Typical usage (bypass Stage 1):
+  python scripts/stage3_debug.py --prompt "..." --phrases "a, b, c" \
+      --no-allow-nsfw --mode chunked_map_union --chunk-size 60 --per-phrase-k 2
+If --phrases is omitted, the script uses a simple fallback: it treats the prompt as a
+comma-separated list of phrases (useful for quick tests).
+Outputs:
+- Stage 2 candidate stats
+- Stage 3 per-call config + validation diagnostics (via the selector's log hook)
+- Final selected tags
+NOTE: This script expects your project package imports to work (run from repo root).
+"""
+from __future__ import annotations
+import argparse
+import sys
+from pathlib import Path
+from typing import Any, List, Sequence, cast
+# Ensure repo root is on sys.path when running as a script:
+#   python scripts/stage3_debug.py ...
+# This makes `import psq_rag...` work without requiring editable installs.
+_REPO_ROOT = Path(__file__).resolve().parents[1]
+if str(_REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(_REPO_ROOT))
+def _split_csv_phrases(s: str) -> List[str]:
+    # Very small helper: split on commas, trim, drop empties.
+    return [p.strip() for p in s.split(",") if p.strip()]
+def _import_stage2_entrypoint():
+    """Import the Stage 2 entrypoints from psq_rag.retrieval.psq_retrieval.
+    This harness supports two paths:
+    A) End-to-end-ish (Stage 1 + Stage 2):
+       - psq_candidates_from_prompt(prompt: str, allow_nsfw_tags: bool, ...)
+    B) Bypass Stage 1 (Stage 2 only):
+       - psq_candidates_from_rewrite_phrases(rewrite_phrases: List[str], allow_nsfw_tags: bool, ...)
+    Returns:
+        A (from_prompt, from_phrases) tuple. An element is None when the module has no
+        callable under that name, so callers can rely on an `is None` check. If only the
+        legacy psq_candidates(...) exists, it is returned in both slots.
+    Raises:
+        RuntimeError: if none of the three known entrypoints is callable. The message
+        lists the module's public callables to help pick the right one.
+    """
+    import psq_rag.retrieval.psq_retrieval as m
+    def _callable_or_none(name: str):
+        # A non-callable attribute is as unusable as a missing one; report both as None.
+        obj = getattr(m, name, None)
+        return obj if callable(obj) else None
+    fn_prompt = _callable_or_none("psq_candidates_from_prompt")
+    fn_phrases = _callable_or_none("psq_candidates_from_rewrite_phrases")
+    if fn_prompt is not None or fn_phrases is not None:
+        return fn_prompt, fn_phrases
+    # Older naming (very old code paths): use the same function for both paths.
+    legacy = _callable_or_none("psq_candidates")
+    if legacy is not None:
+        return legacy, legacy
+    # Fail loudly with guidance.
+    public = sorted(
+        name
+        for name, obj in vars(m).items()
+        if callable(obj) and not name.startswith("_")
+    )
+    raise RuntimeError(
+        "Expected a Stage 2 function (psq_candidates_from_prompt, "
+        "psq_candidates_from_rewrite_phrases, or legacy psq_candidates) in "
+        "psq_rag.retrieval.psq_retrieval, but none was found.\n"
+        "Public callables in that module:\n  - "
+        + "\n  - ".join(public)
+        + "\n\nUpdate _import_stage2_entrypoint() in scripts/stage3_debug.py to use the correct function."
+    )
+def _import_stage3_selector():
+    """Return (llm_select_indices, _split_candidates_by_type) from psq_rag.llm.select.
+    The import is done inside the function, so psq_rag.llm.select is loaded only when called.
+    """
+    from psq_rag.llm.select import llm_select_indices, _split_candidates_by_type
+    return llm_select_indices, _split_candidates_by_type
+def _import_stage1_rewrite():
+    """Return llm_rewrite_prompt from psq_rag.llm.rewrite.
+    The import is done inside the function, so psq_rag.llm.rewrite is loaded only when called.
+    """
+    from psq_rag.llm.rewrite import llm_rewrite_prompt
+    return llm_rewrite_prompt
+def _as_list_candidates(stage2_result: Any):
+    """Reduce a Stage 2 return value to a (candidates, aux) pair.
+    Shapes handled:
+      - dict with a "candidates" key -> (that value, the whole dict)
+      - non-empty tuple              -> (first element, the whole tuple)
+      - anything else                -> (the value itself, None)
+    """
+    if isinstance(stage2_result, dict):
+        if "candidates" in stage2_result:
+            return stage2_result["candidates"], stage2_result
+    elif isinstance(stage2_result, tuple) and len(stage2_result) > 0:
+        return stage2_result[0], stage2_result
+    # Bare candidate collections (and unrecognised shapes) pass through with no aux data.
+    return stage2_result, None
+def _safe_tag_display(tag: str) -> str:
+    """Return `tag` with every underscore shown as a space."""
+    words = tag.split("_")
+    return " ".join(words)
+def _print_top_candidates(cands: Sequence[Any], n: int) -> None:
+    """Print the top-n candidates, ordered by score_combined and then count (descending).
+    Candidate is a dataclass-like object with fields: tag, score_combined, count, sources.
+    Missing attributes are tolerated. A score_combined that is None or otherwise
+    non-numeric sorts last instead of raising TypeError from the sort, matching the
+    "?" placeholder the display code already uses for such values.
+    """
+    # We do NOT print sources/count too noisily; this is just a quick glance.
+    print(f"\nTop {min(n, len(cands))} candidates (by score_combined, then count):")
+    def key(c):
+        sc = getattr(c, "score_combined", 0.0)
+        ct = getattr(c, "count", None)
+        # None cannot be ordered against a float, so map unusable values to sentinels.
+        sc_key = sc if isinstance(sc, (float, int)) else float("-inf")
+        ct_key = ct if isinstance(ct, (float, int)) else -1
+        return (sc_key, ct_key)
+    for i, c in enumerate(sorted(cands, key=key, reverse=True)[:n], start=1):
+        tag = getattr(c, "tag", str(c))
+        sc = getattr(c, "score_combined", None)
+        ct = getattr(c, "count", None)
+        sc_s = f"{sc:.4f}" if isinstance(sc, (float, int)) else "?"
+        ct_s = str(ct) if ct is not None else "?"
+        print(f"  {i:>2}. {_safe_tag_display(tag)}   score={sc_s}   count={ct_s}")
+def _describe_candidate_sample(cands: Sequence[Any], n: int = 5) -> None:
+    """Print the first `n` candidates with the runtime type of each field.
+    Objects exposing a `tag` attribute get a per-field dump, lists/tuples show their
+    first three items and item types, and anything else is printed via repr().
+    """
+    print(f"\nCandidate contract sample (first {min(n, len(cands))}):")
+    def _typed(value: Any) -> str:
+        # repr of the value immediately followed by its type name in parentheses.
+        return f"{value!r}({type(value).__name__})"
+    position = 0
+    for cand in cands[:n]:
+        position += 1
+        kind = type(cand).__name__
+        if hasattr(cand, "tag"):
+            fields = [
+                f"{position}. type={kind}",
+                f"tag={getattr(cand, 'tag', None)!r}",
+                "score_combined=" + _typed(getattr(cand, "score_combined", None)),
+                "score_fasttext=" + _typed(getattr(cand, "score_fasttext", None)),
+                "score_context=" + _typed(getattr(cand, "score_context", None)),
+                "count=" + _typed(getattr(cand, "count", None)),
+                f"sources={getattr(cand, 'sources', None)!r}",
+            ]
+            print("  " + " ".join(fields))
+            continue
+        if isinstance(cand, (list, tuple)):
+            head = list(cand)[:3]
+            head_types = [type(item).__name__ for item in head]
+            print(f"  {position}. type={kind} head={head!r} types={head_types}")
+            continue
+        print(f"  {position}. type={kind} value={cand!r}")
+def main(argv: Sequence[str] | None = None) -> int:
+    """Run Stage 2 retrieval and Stage 3 selection for one prompt, printing diagnostics.
+    When --phrases is given, Stage 1 is bypassed and its module is never imported;
+    otherwise the Stage 1 LLM rewrite runs on --prompt and its output is split on commas.
+    Args:
+        argv: Command-line arguments; None lets argparse read sys.argv.
+    Returns:
+        Exit code: 0 on completion (including "no selections"), 2 when --prompt is blank.
+    Raises:
+        RuntimeError: if psq_candidates_from_rewrite_phrases is not available.
+    """
+    ap = argparse.ArgumentParser(description="Stage 3 debug harness (Stage2 -> Stage3)")
+    ap.add_argument(
+        "--prompt",
+        required=True,
+        help="Image description (original prompt).",
+    )
+    ap.add_argument(
+        "--phrases",
+        default="",
+        help=(
+            "Comma-separated Stage 1 rewrite phrases (bypass Stage 1). "
+            "If omitted, the Stage 1 LLM rewrite runs on --prompt and its output is split into phrases."
+        ),
+    )
+    ap.add_argument(
+        "--allow-nsfw",
+        dest="allow_nsfw",
+        action="store_true",
+        help="Allow NSFW tags to appear in Stage 2 candidates.",
+    )
+    ap.add_argument(
+        "--no-allow-nsfw",
+        dest="allow_nsfw",
+        action="store_false",
+        help="Disallow NSFW tags in Stage 2 candidates.",
+    )
+    ap.set_defaults(allow_nsfw=True)
+    ap.add_argument(
+        "--max-cands",
+        type=int,
+        default=0,
+        help="Optional: truncate Stage 2 candidate list to this many candidates (0 = no truncation).",
+    )
+    # Stage 3 knobs
+    ap.add_argument("--mode", choices=["single_shot", "chunked_map_union"], default="chunked_map_union")
+    ap.add_argument("--chunk-size", type=int, default=60)
+    ap.add_argument("--per-phrase-k", type=int, default=2)
+    ap.add_argument("--temperature", type=float, default=0.1)
+    ap.add_argument("--max-tokens", type=int, default=512)
+    ap.add_argument(
+        "--show-top",
+        type=int,
+        default=25,
+        help="Print the top-N Stage 2 candidates for a quick glance.",
+    )
+    args = ap.parse_args(list(argv) if argv is not None else None)
+    prompt = args.prompt.strip()
+    if not prompt:
+        print("--prompt must be non-empty", file=sys.stderr)
+        return 2
+    phrases = _split_csv_phrases(args.phrases) if args.phrases.strip() else []
+    print("Stage3 Debug")
+    print("-----------")
+    print(f"Prompt: {prompt}")
+    print(f"Phrases ({len(phrases)}): {', '.join(phrases)}")
+    print(f"allow_nsfw_tags: {args.allow_nsfw}")
+    # Stage 2
+    # Only the phrase-based entrypoint is used; Stage 1 output is always turned into phrases first.
+    _stage2_from_prompt, stage2_from_phrases = _import_stage2_entrypoint()
+    print("Running Stage 2 (retrieval grounding / candidate generation)...")
+    # Choose the phrase source based on whether --phrases was provided.
+    if phrases:
+        print("Stage 2 path: rewrite_phrases (Stage 1 bypassed)")
+    else:
+        print("Stage 1 path: rewrite (LLM)")
+        # Imported only on this path so that bypassing Stage 1 does not load the rewrite module.
+        stage1_rewrite = _import_stage1_rewrite()
+        rewritten = stage1_rewrite(prompt, log=print)
+        phrases = _split_csv_phrases(rewritten)
+        print(f"Rewrite phrases ({len(phrases)}): {', '.join(phrases)}")
+        print("Stage 2 path: rewrite_phrases (from Stage 1 output)")
+    if stage2_from_phrases is None:
+        raise RuntimeError("psq_candidates_from_rewrite_phrases is not available in psq_rag.retrieval.psq_retrieval")
+    stage2_out = stage2_from_phrases(rewrite_phrases=phrases, allow_nsfw_tags=args.allow_nsfw)
+    candidates, _aux = _as_list_candidates(stage2_out)
+    if not isinstance(candidates, list):
+        candidates = list(candidates)
+    if isinstance(stage2_out, tuple):
+        print(f"Stage 2 return type: tuple len={len(stage2_out)}")
+    elif isinstance(stage2_out, list):
+        print(f"Stage 2 return type: list len={len(stage2_out)}")
+    elif isinstance(stage2_out, dict):
+        print(f"Stage 2 return type: dict keys={sorted(stage2_out.keys())}")
+    else:
+        print(f"Stage 2 return type: {type(stage2_out).__name__}")
+    print(f"Stage 2 candidates type: {type(candidates).__name__} len={len(candidates)}")
+    print(f"Stage 2 returned {len(candidates)} candidates")
+    if args.max_cands and args.max_cands > 0:
+        candidates = candidates[: args.max_cands]
+        print(f"Truncated to {len(candidates)} candidates due to --max-cands")
+    _describe_candidate_sample(candidates, n=5)
+    num_candidates_with_sources = sum(
+        1
+        for c in candidates
+        if hasattr(c, "sources") and bool(getattr(c, "sources", []))
+    )
+    distinct_sources = len(
+        {
+            src
+            for c in candidates
+            if hasattr(c, "sources")
+            for src in getattr(c, "sources", [])
+        }
+    )
+    print(
+        "Stage 2 sources: "
+        f"with_sources={num_candidates_with_sources} "
+        f"distinct_sources={distinct_sources}"
+    )
+    if args.show_top and args.show_top > 0:
+        _print_top_candidates(candidates, args.show_top)
+    # Stage 3
+    llm_select_indices, split_candidates_by_type = _import_stage3_selector()
+    # Show candidate bucket assignments (general vs entity) if candidates are Candidate objects
+    if candidates and hasattr(candidates[0], 'tag'):
+        from psq_rag.retrieval.psq_retrieval import Candidate
+        print("\nCandidate type split (general vs entity):")
+        general_with_idx, entity_with_idx = split_candidates_by_type(cast(List[Candidate], candidates), log=None)
+        print(f"  General candidates (attributes, species, meta, artists): {len(general_with_idx)}")
+        print(f"  Entity candidates (characters only, copyrights filtered): {len(entity_with_idx)}")
+        if entity_with_idx:
+            print(f"\n  Character candidates preview (first {min(10, len(entity_with_idx))}):")
+            for _, cand in entity_with_idx[:10]:
+                print(f"    - {_safe_tag_display(cand.tag)}")
+    else:
+        print("\nSkipping candidate type split (candidates not in Candidate format)")
+    def log(msg: str) -> None:
+        print(msg)
+    print("\nRunning Stage 3 (closed-set selection)...")
+    # NOTE: llm_select_indices returns indices into the ORIGINAL candidates list you pass in.
+    picked = llm_select_indices(
+        query_text=prompt,  # treated as image description in Stage 3
+        candidates=candidates,
+        max_pick=0,
+        log=log,
+        mode=args.mode,
+        chunk_size=args.chunk_size,
+        per_phrase_k=args.per_phrase_k,
+        temperature=args.temperature,
+        max_tokens=args.max_tokens,
+    )
+    print("\nStage 3 selected:")
+    if not picked:
+        print("  (no selections)")
+        return 0
+    # Deduplicate while preserving order
+    seen = set()
+    tags = []
+    for idx in picked:
+        if idx in seen:
+            continue
+        seen.add(idx)
+        c = candidates[idx]
+        tags.append(getattr(c, "tag", str(c)))
+    for t in tags:
+        print(f"  - {t}  ({_safe_tag_display(t)})")
+    print(f"\nTotal selected tags: {len(tags)}")
+    return 0
+# Script entry point: exit with the status code returned by main().
+if __name__ == "__main__":
+    raise SystemExit(main())

scripts/test_alias_filter.py ADDED Viewed

	@@ -0,0 +1,304 @@

+"""Test harness for Stage 3 alias-based character tag filtering.
+Tests _character_matches_via_aliases() and related helper functions to ensure:
+- Character tags only match when the user mentions the character name (or alias)
+- Variant tags (e.g. pikachu_libre) do NOT match when only the base name is used
+- Aliases with series suffixes (e.g. tails_(sonic)) correctly match after normalization
+- Fuzzy matching handles common typos
+- Generic descriptions (e.g. "orange cat") do NOT match character tags
+Usage:
+    python scripts/test_alias_filter.py
+Requires: rapidfuzz (no CSV data files needed - uses mock alias data)
+"""
+from __future__ import annotations
+import sys
+from pathlib import Path
+# Ensure repo root is on sys.path so `import psq_rag...` works when run as a script.
+# parents[0] is the scripts/ directory holding this file; parents[1] is the repo root.
+_REPO_ROOT = Path(__file__).resolve().parents[1]
+if str(_REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(_REPO_ROOT))
+from psq_rag.llm.select import (
+    _normalize_for_matching,
+    _query_words,
+    _alias_matches_query,
+    _character_matches_via_aliases,
+)
+# ---------------------------------------------------------------------------
+# Mock alias data matching real e621 patterns
+# Maps each canonical character tag to the list of its alias tags; an empty list
+# means the tag has no aliases. Passed to _character_matches_via_aliases() below.
+# ---------------------------------------------------------------------------
+MOCK_TAG2ALIASES = {
+    # Garfield: "garfield" is an alias for garfield_the_cat
+    "garfield_the_cat": ["garfield", "garfield_(character)", "garfield_cat"],
+    # Tails / Miles Prower: aliases include tails_(sonic)
+    "miles_prower": ["tails_(sonic)", "tails_the_fox", "tailsko", "miles_tails_prower"],
+    # Pikachu base (species tag type 5, but testing if it were type 4)
+    "pikachu": ["pikachu_(pokemon)"],
+    # Pikachu variants - distinct aliases that should NOT match base "pikachu"
+    "pikachu_libre": ["pikachu_libre_(pokemon)", "libre_pikachu"],
+    "detective_pikachu": ["detective_pikachu_(pokemon)", "detective_pikachu_(movie)"],
+    "cosplay_pikachu_(character)": ["cosplay_pikachu"],
+    # Sonic
+    "sonic_the_hedgehog": ["sonic", "sonic_(character)", "sonic_(sth)"],
+    # Character with no aliases
+    "cat_busters": [],
+    # Mickey Mouse
+    "mickey_mouse": ["mickey", "mickey_(disney)"],
+    # A character whose name is a common word
+    "shadow_the_hedgehog": ["shadow_(sonic)", "shadow"],
+}
+def log(msg: str) -> None:
+    """Print `msg` indented by two spaces."""
+    print(f"  {msg}")
+def run_tests() -> int:
+    """Run every alias-filter check, print one PASS/FAIL line each plus a summary.
+    Returns 0 when all checks pass and 1 when at least one fails.
+    """
+    counts = {"passed": 0, "failed": 0}
+    def check(description: str, result: bool, expected: bool) -> None:
+        # Record one outcome; failures also show the got/expected values.
+        if result == expected:
+            counts["passed"] += 1
+            print(f"  PASS: {description}")
+        else:
+            counts["failed"] += 1
+            print(f"  FAIL: {description} (got={result}, expected={expected})")
+    # (description, raw tag, expected normalized form)
+    normalize_cases = [
+        ("strips series suffix _(sonic)", "tails_(sonic)", "tails"),
+        ("strips _(character) suffix", "garfield_(character)", "garfield"),
+        ("replaces underscores with spaces", "garfield_the_cat", "garfield the cat"),
+        ("lowercases", "Pikachu_Libre", "pikachu libre"),
+        ("no suffix stays intact", "pikachu_libre", "pikachu libre"),
+    ]
+    # (section title, [(query, [(description, character tag, expected match), ...]), ...])
+    matching_sections = [
+        ("Core matching: tails vs miles_prower", [
+            ("tails flying", [
+                ("'tails flying' matches miles_prower (via alias tails_(sonic))", "miles_prower", True),
+            ]),
+        ]),
+        ("Core matching: pikachu vs variants", [
+            ("pikachu with red cheeks", [
+                ("'pikachu with red cheeks' matches pikachu (base tag)", "pikachu", True),
+                ("'pikachu with red cheeks' does NOT match pikachu_libre", "pikachu_libre", False),
+                ("'pikachu with red cheeks' does NOT match detective_pikachu", "detective_pikachu", False),
+                (
+                    "'pikachu with red cheeks' does NOT match cosplay_pikachu_(character)",
+                    "cosplay_pikachu_(character)",
+                    False,
+                ),
+            ]),
+        ]),
+        ("Variant explicitly mentioned", [
+            ("pikachu libre wrestling", [
+                ("'pikachu libre wrestling' matches pikachu_libre", "pikachu_libre", True),
+                ("'pikachu libre wrestling' also matches base pikachu (substring)", "pikachu", True),
+            ]),
+            ("detective pikachu in the rain", [
+                ("'detective pikachu in the rain' matches detective_pikachu", "detective_pikachu", True),
+            ]),
+        ]),
+        ("Garfield via alias", [
+            ("garfield sleeping on a table", [
+                ("'garfield sleeping' matches garfield_the_cat (via alias 'garfield')", "garfield_the_cat", True),
+            ]),
+        ]),
+        ("Generic description should NOT match characters", [
+            ("orange cat sitting outside", [
+                ("'orange cat sitting outside' does NOT match garfield_the_cat", "garfield_the_cat", False),
+                ("'orange cat sitting outside' does NOT match cat_busters", "cat_busters", False),
+            ]),
+            ("mouse character running", [
+                ("'mouse character running' does NOT match mickey_mouse", "mickey_mouse", False),
+            ]),
+        ]),
+        ("Sonic via alias", [
+            ("sonic running fast", [
+                ("'sonic running fast' matches sonic_the_hedgehog (via alias 'sonic')", "sonic_the_hedgehog", True),
+            ]),
+        ]),
+        ("Fuzzy matching: typos", [
+            ("garfeild sleeping", [
+                ("'garfeild' (typo) matches garfield_the_cat via fuzzy", "garfield_the_cat", True),
+            ]),
+            ("pikachuu battling", [
+                ("'pikachuu' (typo) matches pikachu via fuzzy", "pikachu", True),
+            ]),
+        ]),
+        ("Shadow: common word that is also a character alias", [
+            ("shadow the hedgehog posing", [
+                ("'shadow the hedgehog posing' matches shadow_the_hedgehog", "shadow_the_hedgehog", True),
+            ]),
+            # "shadow" alone is an alias - this WILL match because the user said "shadow"
+            ("shadow lurking in darkness", [
+                (
+                    "'shadow lurking in darkness' matches shadow_the_hedgehog (alias 'shadow')",
+                    "shadow_the_hedgehog",
+                    True,
+                ),
+            ]),
+        ]),
+        ("Tag with no aliases and no name match", [
+            ("a dog playing fetch", [
+                (
+                    "'a dog playing fetch' does NOT match cat_busters (no aliases, no name match)",
+                    "cat_busters",
+                    False,
+                ),
+            ]),
+        ]),
+    ]
+    print("\n=== _normalize_for_matching ===")
+    for description, raw, normalized in normalize_cases:
+        check(description, _normalize_for_matching(raw) == normalized, True)
+    for title, queries in matching_sections:
+        print(f"\n=== {title} ===")
+        for query, cases in queries:
+            # Word set and normalized text are computed once per query and shared by its cases.
+            qwords = _query_words(query)
+            qnorm = _normalize_for_matching(query)
+            for description, tag, expected in cases:
+                matched = _character_matches_via_aliases(tag, query, MOCK_TAG2ALIASES, qwords, qnorm)
+                check(description, matched, expected)
+    # Summary
+    passed = counts["passed"]
+    failed = counts["failed"]
+    total = passed + failed
+    rule = "=" * 50
+    print(f"\n{rule}")
+    print(f"Results: {passed}/{total} passed, {failed}/{total} failed")
+    print("ALL TESTS PASSED" if failed == 0 else "SOME TESTS FAILED")
+    print(f"{rule}")
+    return int(failed > 0)
+# Script entry point: the process exit status is the value returned by run_tests().
+if __name__ == "__main__":
+    sys.exit(run_tests())

transparentsquirrel.png ADDED Viewed

Git LFS Details

SHA256: 090a20f6afc0879333afb01ee491df994ed549c543aac861d76ab1fa05978a90
Pointer size: 131 Bytes
Size of remote file: 257 kB

wiki_pages-2023-08-08.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d453c0cc8ae09c548e554ceb77b1c1578c277eb2c5a6278a85f89c73566a7b27
+size 30986436

word_rating_probabilities.csv ADDED Viewed

The diff for this file is too large to render. See raw diff