Spaces:

FoodDesert
/

Prompt_Squirrel_RAG

Running

Prompt_Squirrel_RAG / scripts /analyze_probe_informativeness.py

Food Desert

Consolidate probe configs and eval artifacts on main

6e50f4d 3 months ago

17.9 kB

	"""Rank candidate probe tags by informativeness before any LLM queries.

	This is an offline metric pass combining:
	- entropy / information gain from sample co-occurrence,
	- lift against active groups/categories,
	- reduced TF-IDF semantic focus against group centroids.

	Compact outputs (overwrite in place):
	- data/analysis/probe_informativeness.csv
	- data/analysis/probe_informativeness_summary.json
	"""
	from __future__ import annotations

	import csv
	import json
	import math
	from collections import Counter
	from pathlib import Path
	from typing import Dict, List, Set, Tuple

	import numpy as np

	from psq_rag.retrieval.state import get_tfidf_tag_vectors


	# Parent of the directory that contains this script; every data path below
	# is resolved against it.
	REPO = Path(__file__).resolve().parents[1]
	# Tag-count CSV: load_counts() reads column 0 as the tag and column 2 as its count.
	COUNTS_CSV = REPO / "fluffyrock_3m.csv"
	# JSONL sample: load_image_tags() reads each record's
	# "tags_ground_truth_categorized" field.
	SAMPLE_JSONL = REPO / "data" / "eval_samples" / "e621_sfw_sample_1000_seed123_buffer10000.jsonl"
	# JSON object read by load_groups() as group name -> list of tags.
	WIKI_GROUPS_JSON = REPO / "data" / "tag_groups.json"
	# CSV read by load_groups(); columns used: category_enabled, category_name, tag.
	REGISTRY_CSV = REPO / "data" / "category_registry.csv"
	# Policy CSV naming the wiki groups to exclude
	# (see load_excluded_wiki_groups_from_policy()).
	CATEGORY_TAG_GROUP_MAP_CSV = REPO / "data" / "analysis" / "category_tag_group_map.csv"

	# Report files written (and overwritten) by main().
	OUT_CSV = REPO / "data" / "analysis" / "probe_informativeness.csv"
	OUT_SUMMARY = REPO / "data" / "analysis" / "probe_informativeness_summary.json"

	# Minimum count in COUNTS_CSV for a tag to be kept when loading image tags.
	MIN_COUNT = 200
	# Minimum number of sample images a tag must appear in to be scored as a probe.
	MIN_PROBE_IMAGES = 5
	# Minimum number of sample images a group must touch to count as active.
	MIN_GROUP_IMAGES = 20
	# Temperature of the softmax applied to probe-vs-centroid similarities.
	SOFTMAX_TAU = 0.15
	# Weight of the redundancy penalty (mean pairwise MI) in the shortlist selection.
	MMR_LAMBDA = 0.35
	# Number of top-scoring probes the shortlist selection draws from.
	MMR_TOP_POOL = 120
	# Maximum size of the diversified shortlist.
	MMR_K = 15

	# Tags that needs_glossary() always reports as needing a glossary entry.
	DOMAIN_JARGON = {
	"solo", "duo", "trio", "anthro", "feral", "gynomorph", "andromorph", "maleherm",
	"topwear", "bottomwear", "legwear", "handwear", "headwear", "footwear",
	"leporid", "canid", "canis", "felid", "felis", "equid", "haplorhine",
	"zero_pictured", "male/female", "male/male", "female/female",
	}


	def load_counts(path: Path) -> Dict[str, int]:
	    """Read a tag-count CSV into a ``tag -> count`` mapping.

	    Column 0 is taken as the tag and column 2 as its count. Rows with fewer
	    than three columns are ignored. An empty or non-integer count (for
	    example a header cell) is stored as 0. A later row for the same tag
	    overwrites an earlier one.

	    Args:
	        path: CSV file to read (UTF-8).

	    Returns:
	        Mapping from tag to its integer count.
	    """
	    tag_counts: Dict[str, int] = {}
	    with path.open("r", encoding="utf-8", newline="") as handle:
	        for record in csv.reader(handle):
	            if len(record) < 3:
	                continue
	            tag, raw_count = record[0], record[2]
	            if not raw_count:
	                tag_counts[tag] = 0
	                continue
	            try:
	                tag_counts[tag] = int(raw_count)
	            except ValueError:
	                # Non-numeric cell: fall back to zero.
	                tag_counts[tag] = 0
	    return tag_counts


	def load_image_tags(path: Path, counts: Dict[str, int], min_count: int) -> List[Set[str]]:
	    """Load one set of sufficiently frequent tags per sample image.

	    Each non-blank line of *path* is parsed as a JSON object. Its
	    ``tags_ground_truth_categorized`` field is expected to be a JSON-encoded
	    string holding a mapping whose values are lists of tags. Only string
	    tags with ``counts[tag] >= min_count`` are kept. Records with a missing,
	    empty or undecodable field, or with no surviving tags, are skipped.

	    Args:
	        path: JSONL file to read (UTF-8), one record per line.
	        counts: Tag -> corpus count; tags absent from it count as 0.
	        min_count: Minimum count a tag needs to be kept.

	    Returns:
	        One non-empty tag set per usable record, in file order.

	    Raises:
	        json.JSONDecodeError: If a non-blank line is not valid JSON.
	    """
	    rows: List[Set[str]] = []
	    with path.open("r", encoding="utf-8") as f:
	        for line in f:
	            # Blank lines (for example a trailing newline at the end of the
	            # file) are not records; skip them instead of failing in json.loads.
	            if not line.strip():
	                continue
	            obj = json.loads(line)
	            raw = obj.get("tags_ground_truth_categorized", "")
	            if not raw:
	                continue
	            try:
	                categorized = json.loads(raw)
	            except (ValueError, TypeError):
	                # ValueError covers malformed JSON text; TypeError covers a
	                # field that is not a str/bytes payload at all.
	                continue
	            if not isinstance(categorized, dict):
	                continue
	            tags: Set[str] = set()
	            for vals in categorized.values():
	                if not isinstance(vals, list):
	                    continue
	                for t in vals:
	                    if isinstance(t, str) and counts.get(t, 0) >= min_count:
	                        tags.add(t)
	            if tags:
	                rows.append(tags)
	    return rows


	def load_excluded_wiki_groups_from_policy(path: Path) -> Set[str]:
	    """Collect the wiki group names that the policy file marks as ignored.

	    A row contributes its ``tag_group`` value when all of these hold:
	    - ``enabled`` is one of "1", "true" or "True" (after stripping),
	    - ``category_name`` starts with "ignored_" (case-insensitive),
	    - ``tag_group`` is non-empty after stripping.

	    Args:
	        path: Tag-group map CSV (UTF-8, with a header row).

	    Returns:
	        The set of excluded group names; empty when *path* is not a file.
	    """
	    ignored_groups: Set[str] = set()
	    if not path.is_file():
	        return ignored_groups

	    enabled_markers = {"1", "true", "True"}

	    def cell(record, column: str) -> str:
	        # Missing or None cells are treated as empty strings.
	        return (record.get(column) or "").strip()

	    with path.open("r", encoding="utf-8", newline="") as handle:
	        for record in csv.DictReader(handle):
	            if cell(record, "enabled") not in enabled_markers:
	                continue
	            category_name = cell(record, "category_name").lower()
	            wiki_group = cell(record, "tag_group")
	            if wiki_group and category_name.startswith("ignored_"):
	                ignored_groups.add(wiki_group)
	    return ignored_groups


	def load_groups() -> Tuple[Dict[str, Set[str]], Set[str]]:
	    """Build the tag groups used for scoring, from wiki groups and the registry.

	    Wiki groups come from ``WIKI_GROUPS_JSON`` and are stored under
	    ``"wiki:<name>"``; groups named in the exclusion policy, and entries
	    whose value is not a list, are left out. Registry categories come from
	    enabled rows of ``REGISTRY_CSV`` and are stored under ``"cat:<name>"``.

	    Returns:
	        ``(groups, excluded_wiki_groups)`` where *groups* maps a prefixed
	        group name to its set of tags and *excluded_wiki_groups* is the set
	        of wiki group names that were skipped by policy.
	    """
	    excluded_wiki_groups = load_excluded_wiki_groups_from_policy(CATEGORY_TAG_GROUP_MAP_CSV)

	    with WIKI_GROUPS_JSON.open("r", encoding="utf-8") as wiki_file:
	        wiki_groups = json.load(wiki_file)

	    # Only non-empty string members are kept for each wiki group.
	    groups: Dict[str, Set[str]] = {
	        f"wiki:{name}": {tag for tag in members if isinstance(tag, str) and tag}
	        for name, members in wiki_groups.items()
	        if name not in excluded_wiki_groups and isinstance(members, list)
	    }

	    enabled_markers = {"1", "true", "True"}
	    with REGISTRY_CSV.open("r", encoding="utf-8", newline="") as registry_file:
	        for record in csv.DictReader(registry_file):
	            if (record.get("category_enabled") or "").strip() not in enabled_markers:
	                continue
	            category = (record.get("category_name") or "").strip()
	            tag = (record.get("tag") or "").strip()
	            if not (category and tag):
	                continue
	            groups.setdefault(f"cat:{category}", set()).add(tag)

	    return groups, excluded_wiki_groups


	def needs_glossary(tag: str) -> bool:
	    """Return True when *tag* likely needs a glossary entry in a prompt.

	    A tag is flagged when it is listed in ``DOMAIN_JARGON``, contains "/",
	    "(" or ")", contains any digit, or ends in "id" or "ine"
	    (taxonomy-like suffixes that often need disambiguation).
	    """
	    return (
	        tag in DOMAIN_JARGON
	        or any(mark in tag for mark in ("/", "(", ")"))
	        or any(ch.isdigit() for ch in tag)
	        or tag.endswith(("id", "ine"))
	    )


	def infer_probe_bundle(tag: str, semantic_top_group: str, strongest_group: str) -> str:
	    """Assign *tag* to a coarse probe bundle using ordered heuristics.

	    Rules are tried in order and the first match wins: exact tag names
	    first, then substring matches on the tag, then substring matches on the
	    two group names. Tags that match nothing fall into ``"other"``.

	    Args:
	        tag: The probe tag.
	        semantic_top_group: Name of the closest semantic group (may be "").
	        strongest_group: Name of the group with the strongest lift.

	    Returns:
	        The bundle name.
	    """
	    # (bundle, exact tag names, substrings of the tag), in priority order.
	    tag_rules = (
	        ("count_cardinality", frozenset({"solo", "duo", "trio", "group", "zero_pictured"}), ()),
	        ("body_type_presence", frozenset({"anthro", "feral", "humanoid", "biped", "quadruped"}), ()),
	        (
	            "clothing_state",
	            frozenset({"clothed", "clothing", "topless", "bottomless", "nude", "barefoot", "topwear", "bottomwear"}),
	            (),
	        ),
	        (
	            "species_taxonomy",
	            frozenset(),
	            ("canid", "canis", "felid", "felis", "equid", "leporid", "species", "mammal", "bird", "bear", "unicorn", "reptile", "dragon"),
	        ),
	        (
	            "body_shape_breasts",
	            frozenset(),
	            ("breast", "thigh", "hips", "curvy", "muscular", "overweight", "chubby", "butt"),
	        ),
	        (
	            "gaze_expression",
	            frozenset(),
	            ("look", "gaze", "eyes", "smile", "blush", "open_mouth", "eyes_closed"),
	        ),
	        ("text_symbols", frozenset({"text", "dialogue", "<3"}), ("text", "dialogue", "logo", "symbol")),
	        (
	            "scene_pose",
	            frozenset(),
	            ("background", "outside", "inside", "indoors", "outdoors", "standing", "sitting"),
	        ),
	    )
	    for bundle, exact_names, fragments in tag_rules:
	        if tag in exact_names or any(fragment in tag for fragment in fragments):
	            return bundle

	    # Fall back to hints carried by the group names.
	    group_context = f"{semantic_top_group} {strongest_group}".lower()
	    if "cat:clothing" in group_context or "wiki:clothes" in group_context:
	        return "clothing_state"
	    if "cat:count" in group_context:
	        return "count_cardinality"
	    return "other"


	def entropy_binary(p: float) -> float:
	    """Return the entropy, in bits, of a Bernoulli variable with P(1) = *p*.

	    *p* is clamped to ``[1e-12, 1 - 1e-12]`` first so both logarithms are
	    always defined.
	    """
	    p = max(p, 1e-12)
	    p = min(p, 1 - 1e-12)
	    complement = 1 - p
	    return -(p * math.log2(p) + complement * math.log2(complement))


	def softmax(x: np.ndarray, tau: float) -> np.ndarray:
	    """Return the temperature-scaled softmax of *x*.

	    Args:
	        x: Scores to normalize.
	        tau: Temperature; values below 1e-6 are raised to 1e-6.

	    Returns:
	        Array of the same shape as *x* with the softmax weights.
	    """
	    temperature = max(tau, 1e-6)
	    scaled = x / temperature
	    # Shift by the maximum so the largest exponent is exp(0).
	    shifted = scaled - np.max(scaled)
	    weights = np.exp(shifted)
	    # Floor the normalizer to avoid dividing by zero.
	    normalizer = max(np.sum(weights), 1e-12)
	    return weights / normalizer


	def binary_mi(a_idx: Set[int], b_idx: Set[int], n: int) -> float:
	    """Return the mutual information, in bits, of two binary indicators.

	    The indicators are "item is in *a_idx*" and "item is in *b_idx*" over a
	    population of *n* items. The joint distribution is estimated from the
	    four contingency counts.

	    Args:
	        a_idx: Indices where the first indicator is 1.
	        b_idx: Indices where the second indicator is 1.
	        n: Population size.

	    Returns:
	        The mutual information, floored at 0.0. Returns 0.0 when *n* is not
	        positive, since no distribution can be estimated from an empty
	        population.
	    """
	    if n <= 0:
	        return 0.0

	    # Contingency counts; the off-diagonal cells follow from the set sizes.
	    n11 = len(a_idx & b_idx)
	    n10 = len(a_idx) - n11
	    n01 = len(b_idx) - n11
	    n00 = n - n11 - n10 - n01

	    pa = (n11 + n10) / n
	    pb = (n11 + n01) / n

	    mi = 0.0
	    cells = (
	        (n11, pa, pb),
	        (n10, pa, 1 - pb),
	        (n01, 1 - pa, pb),
	        (n00, 1 - pa, 1 - pb),
	    )
	    for count, qa, qb in cells:
	        p = count / n
	        if p <= 0:
	            # Empty cells contribute nothing (0 * log 0 is taken as 0).
	            continue
	        mi += p * math.log2(p / max(qa * qb, 1e-12))
	    # Rounding can push an independent pair slightly below zero.
	    return max(mi, 0.0)


	def main() -> None:
	    """Score every candidate probe tag and write the CSV and JSON reports.

	    Steps:
	    1. Load corpus counts and the per-image tag sets.
	    2. Index which images each tag ("probe") and each tag group occurs in;
	       groups seen in fewer than ``MIN_GROUP_IMAGES`` images are dropped.
	    3. Build one unit-length centroid per active group from the TF-IDF tag
	       vectors (groups with fewer than two known tags get no centroid).
	    4. For each probe seen in at least ``MIN_PROBE_IMAGES`` images, compute
	       information gain and lift against every active group, semantic focus
	       against the centroids, and a combined score.
	    5. Add an actionability score, pick a diversified shortlist, aggregate
	       per-bundle scores, and write ``OUT_CSV`` and ``OUT_SUMMARY``
	       (both are overwritten).

	    Raises:
	        RuntimeError: If no image has usable tags, or no group reaches
	            ``MIN_GROUP_IMAGES`` images.
	    """
	    counts = load_counts(COUNTS_CSV)
	    image_tags = load_image_tags(SAMPLE_JSONL, counts, MIN_COUNT)
	    n_images = len(image_tags)
	    if n_images == 0:
	        raise RuntimeError("No image tags loaded.")

	    groups_all, excluded_wiki_groups = load_groups()

	    # Inverted index: tag -> indices of the images that carry it.
	    probe_to_images: Dict[str, Set[int]] = {}
	    for i, tags in enumerate(image_tags):
	        for t in tags:
	            probe_to_images.setdefault(t, set()).add(i)

	    # Group -> indices of the images that carry at least one member tag.
	    group_to_images: Dict[str, Set[int]] = {}
	    for g, members in groups_all.items():
	        idxs: Set[int] = {
	            i for i, tags in enumerate(image_tags) if not tags.isdisjoint(members)
	        }
	        if len(idxs) >= MIN_GROUP_IMAGES:
	            group_to_images[g] = idxs

	    active_groups = sorted(group_to_images)
	    if not active_groups:
	        raise RuntimeError("No active groups after MIN_GROUP_IMAGES filter.")

	    # Semantic centroids for active groups.
	    # NOTE(review): presumably "reduced_matrix_norm" holds one row-normalized
	    # vector per tag and "tag_to_row_index" maps tag -> row; confirm against
	    # get_tfidf_tag_vectors().
	    vec = get_tfidf_tag_vectors()
	    mat = vec["reduced_matrix_norm"]
	    tag_to_row = vec["tag_to_row_index"]

	    group_centroids: Dict[str, np.ndarray] = {}
	    for g in active_groups:
	        rows = [tag_to_row[t] for t in groups_all[g] if t in tag_to_row]
	        if len(rows) < 2:
	            continue
	        centroid = mat[rows].mean(axis=0)
	        norm = np.linalg.norm(centroid)
	        if norm > 0:
	            group_centroids[g] = centroid / norm

	    semantic_groups = sorted(group_centroids)
	    C = np.stack([group_centroids[g] for g in semantic_groups], axis=0) if semantic_groups else None

	    baseline_group_probs = {g: len(group_to_images[g]) / n_images for g in active_groups}
	    baseline_top5_mass = sum(sorted(baseline_group_probs.values(), reverse=True)[:5])

	    rows_out: List[Dict[str, str]] = []
	    probe_scores: Dict[str, float] = {}

	    for p, p_idxs in probe_to_images.items():
	        n_probe = len(p_idxs)
	        if n_probe < MIN_PROBE_IMAGES:
	            continue
	        q = n_probe / n_images
	        if q <= 0.0 or q >= 1.0:
	            continue
	        # q < 1 guarantees at least one image without the probe.
	        n_without_probe = n_images - n_probe

	        ig_sum = 0.0
	        ig_vals = []
	        mean_abs_log_lift = 0.0
	        lifts: Dict[str, float] = {}
	        p1_group_probs: Dict[str, float] = {}

	        for g in active_groups:
	            g_idxs = group_to_images[g]
	            n_group = len(g_idxs)
	            n_both = len(p_idxs & g_idxs)
	            pg = n_group / n_images
	            pg1 = n_both / n_probe
	            # Images in the group but without the probe, obtained by
	            # subtraction instead of materializing the complement set.
	            pg0 = (n_group - n_both) / n_without_probe

	            # Information gain about group membership from observing the probe.
	            ig = entropy_binary(pg) - (q * entropy_binary(pg1) + (1 - q) * entropy_binary(pg0))
	            ig = max(ig, 0.0)
	            ig_vals.append(ig)
	            ig_sum += ig

	            lift = (pg1 + 1e-9) / (pg + 1e-9)
	            lifts[g] = lift
	            p1_group_probs[g] = pg1
	            mean_abs_log_lift += abs(math.log2(lift + 1e-12))

	        mean_abs_log_lift /= len(active_groups)
	        ig_mean = float(np.mean(ig_vals)) if ig_vals else 0.0
	        top5_mass_p1 = sum(sorted(p1_group_probs.values(), reverse=True)[:5])
	        delta_top5_mass = top5_mass_p1 - baseline_top5_mass

	        # Group whose lift is furthest from 1 in either direction.
	        strongest_group = max(lifts.items(), key=lambda kv: abs(math.log2(kv[1] + 1e-12)))
	        strongest_group_name = strongest_group[0]
	        strongest_group_lift = strongest_group[1]

	        # Defaults for probes without a vector: no top group, no focus.
	        semantic_top_group = ""
	        semantic_margin = 0.0
	        semantic_entropy_norm = 1.0
	        if C is not None and p in tag_to_row:
	            sims = C @ mat[tag_to_row[p]]
	            order = np.argsort(sims)[::-1]
	            i1 = int(order[0])
	            i2 = int(order[1]) if len(order) > 1 else i1
	            semantic_top_group = semantic_groups[i1]
	            semantic_margin = float(sims[i1] - sims[i2])
	            probs = softmax(sims, SOFTMAX_TAU)
	            h = -float(np.sum(probs * np.log2(np.maximum(probs, 1e-12))))
	            # Normalize by the maximum possible entropy for this many groups.
	            semantic_entropy_norm = h / math.log2(len(probs)) if len(probs) > 1 else 0.0

	        prevalence_balance = math.sqrt(q * (1 - q))
	        focus = max(0.0, 1.0 - semantic_entropy_norm)
	        combined_score = ig_sum * prevalence_balance * (0.5 + 0.5 * focus)
	        probe_scores[p] = combined_score

	        rows_out.append(
	            {
	                "tag": p,
	                "sample_occurrences": str(n_probe),
	                "fluffyrock_count": str(counts.get(p, 0)),
	                "prevalence": f"{q:.6f}",
	                "ig_sum_bits": f"{ig_sum:.6f}",
	                "ig_mean_bits": f"{ig_mean:.6f}",
	                "delta_top5_mass": f"{delta_top5_mass:.6f}",
	                "mean_abs_log2_lift": f"{mean_abs_log_lift:.6f}",
	                "semantic_top_group": semantic_top_group,
	                "semantic_margin": f"{semantic_margin:.6f}",
	                "semantic_entropy_norm": f"{semantic_entropy_norm:.6f}",
	                "strongest_group_by_lift": strongest_group_name,
	                "strongest_group_lift": f"{strongest_group_lift:.6f}",
	                "suggested_probe_bundle": infer_probe_bundle(p, semantic_top_group, strongest_group_name),
	                "needs_glossary": "1" if needs_glossary(p) else "0",
	                "combined_score": f"{combined_score:.6f}",
	            }
	        )

	    # Add an actionability score that downweights very common probes and favors
	    # probes that noticeably reshape top-group mass. It is computed from the
	    # 6-decimal values already stored in each row.
	    for r in rows_out:
	        q = float(r["prevalence"])
	        ig = float(r["ig_sum_bits"])
	        delta_top5 = max(0.0, float(r["delta_top5_mass"]))
	        semantic_focus = max(0.0, 1.0 - float(r["semantic_entropy_norm"]))
	        prevalence_penalty = max(0.0, 1.0 - abs(2 * q - 1.0))
	        actionable_score = ig * prevalence_penalty * delta_top5 * (0.5 + 0.5 * semantic_focus)
	        r["actionable_score"] = f"{actionable_score:.6f}"

	    rows_out.sort(key=lambda r: float(r["combined_score"]), reverse=True)

	    # Diversified shortlist via MMR-like greedy on top pool: each step takes
	    # the probe with the best score minus its mean MI with those already chosen.
	    top_pool = [r["tag"] for r in rows_out[:MMR_TOP_POOL]]
	    selected: List[str] = []
	    while len(selected) < MMR_K and top_pool:
	        best_tag = None
	        best_val = -1e9
	        for t in top_pool:
	            rel = probe_scores.get(t, 0.0)
	            if not selected:
	                val = rel
	            else:
	                red = float(np.mean([binary_mi(probe_to_images[t], probe_to_images[s], n_images) for s in selected]))
	                val = rel - MMR_LAMBDA * red
	            if val > best_val:
	                best_val = val
	                best_tag = t
	        if best_tag is None:
	            break
	        selected.append(best_tag)
	        top_pool.remove(best_tag)

	    OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
	    with OUT_CSV.open("w", encoding="utf-8", newline="") as f:
	        writer = csv.DictWriter(
	            f,
	            fieldnames=[
	                "tag",
	                "sample_occurrences",
	                "fluffyrock_count",
	                "prevalence",
	                "ig_sum_bits",
	                "ig_mean_bits",
	                "delta_top5_mass",
	                "mean_abs_log2_lift",
	                "semantic_top_group",
	                "semantic_margin",
	                "semantic_entropy_norm",
	                "strongest_group_by_lift",
	                "strongest_group_lift",
	                "suggested_probe_bundle",
	                "needs_glossary",
	                "combined_score",
	                "actionable_score",
	            ],
	        )
	        writer.writeheader()
	        writer.writerows(rows_out)

	    # Aggregate bundle-level utility using top actionable tags per bundle.
	    by_bundle: Dict[str, List[Dict[str, str]]] = {}
	    for r in rows_out:
	        by_bundle.setdefault(r["suggested_probe_bundle"], []).append(r)
	    bundle_scores = []
	    for b, items in by_bundle.items():
	        items_sorted = sorted(items, key=lambda x: float(x["actionable_score"]), reverse=True)
	        top_items = items_sorted[:5]
	        score = sum(float(x["actionable_score"]) for x in top_items)
	        glossary_rate = sum(1 for x in top_items if x["needs_glossary"] == "1") / len(top_items) if top_items else 0.0
	        bundle_scores.append(
	            {
	                "bundle": b,
	                "bundle_score_top5_actionable": round(score, 6),
	                "top_tags": [x["tag"] for x in top_items],
	                "glossary_rate_top5": round(glossary_rate, 3),
	            }
	        )
	    bundle_scores.sort(key=lambda x: x["bundle_score_top5_actionable"], reverse=True)

	    top_actionable = sorted(rows_out, key=lambda r: float(r["actionable_score"]), reverse=True)
	    top_mid_prevalence = [
	        r for r in top_actionable if 0.03 <= float(r["prevalence"]) <= 0.35
	    ][:40]

	    summary = {
	        "config": {
	            "min_count": MIN_COUNT,
	            "min_probe_images": MIN_PROBE_IMAGES,
	            "min_group_images": MIN_GROUP_IMAGES,
	            "softmax_tau": SOFTMAX_TAU,
	            "mmr_lambda": MMR_LAMBDA,
	            "mmr_top_pool": MMR_TOP_POOL,
	            "mmr_k": MMR_K,
	        },
	        "n_images": n_images,
	        "n_candidate_probes": len(rows_out),
	        "n_active_groups": len(active_groups),
	        "excluded_wiki_groups": sorted(excluded_wiki_groups),
	        "top_probes_by_combined_score": rows_out[:25],
	        "top_probes_by_actionable_score": top_actionable[:25],
	        "top_actionable_mid_prevalence_for_manual_review": top_mid_prevalence,
	        "bundle_scores": bundle_scores[:20],
	        "diversified_probe_shortlist": selected,
	        "outputs": {
	            "csv": str(OUT_CSV),
	            "summary_json": str(OUT_SUMMARY),
	        },
	    }

	    with OUT_SUMMARY.open("w", encoding="utf-8") as f:
	        json.dump(summary, f, indent=2, ensure_ascii=False)

	    print(f"Images: {n_images}")
	    print(f"Active groups: {len(active_groups)}")
	    print(f"Candidate probes: {len(rows_out)}")
	    print(f"Top probes: {[r['tag'] for r in rows_out[:10]]}")
	    print(f"Diversified shortlist: {selected}")
	    print(f"Outputs: {OUT_CSV}, {OUT_SUMMARY}")


	# Run the analysis only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()