| |
| """ |
| BM25 retrieval quality probe — does the retriever find the RIGHT doc |
| in top-K for each diagnostic query? This determines whether a reranker |
| can possibly succeed (reranker is only as good as its candidate pool). |
| |
| For each test query: ask the API for top-30 docs, then check whether |
| the doc containing the expected anchor term shows up in top-1 / top-3 / |
| top-10 / top-30. The anchor is the ground-truth section/concept that |
| the teacher answer cites. |
| """ |
| import json |
| import re |
| import urllib.request |
| from pathlib import Path |
|
|
# Endpoint of the query service that the probe sends each test query to.
API = "http://127.0.0.1:8000/v1/query"


# Diagnostic cases as (query, anchor_regex) pairs. The anchor is a regular
# expression searched for in each returned doc's text; the rank of the first
# matching doc is what the probe reports for that query.
TESTS = [
    # Statute-section lookups.
    (
        "מה אומר סעיף 39 לחוק החוזים?",
        r"(KL.?39|סעיף\s*39|תום\s*לב)",
    ),
    (
        "מה אומר סעיף 12 לחוק החוזים?",
        r"(KL.?12|סעיף\s*12|תום\s*לב\s*במשא)",
    ),
    (
        "מה אומר סעיף 30 לחוק החוזים?",
        r"(KL.?30|סעיף\s*30|חוזה\s*פסול|תקנת\s*הציבור)",
    ),
    # Concept definitions.
    (
        "מה זה גמירת דעת?",
        r"גמירת\s*דעת",
    ),
    (
        "מה זה מסויימות?",
        r"מסוימות|מסויימות",
    ),
    (
        "הסבר על קיום בתום לב",
        r"(KL.?39|סעיף\s*39|תום\s*לב|קיום)",
    ),
    # Sale / lease questions.
    (
        "מה התרופות במכר פגום?",
        r"(תרופות|אכיפה|מכר|פגם)",
    ),
    (
        "מתי קונה רשאי לבטל מכר?",
        r"(ביטול|מכר|הפרה\s*יסודית|ארכה)",
    ),
    (
        "מה ההבדל בין שכירות למכירה?",
        r"(שכירות|מכר|חוק\s*המכר|חוק\s*השכירות)",
    ),
    # NOTE(review): presumably a negative control — a nonsense query whose
    # anchor is not expected to match any doc; confirm the intent.
    (
        "שנשר25512551",
        r"(NOMATCH_GIBBERISH)",
    ),
    # Rephrasings of earlier questions.
    (
        "הסבר את חובת תום הלב בחוזים",
        r"(KL.?39|סעיף\s*39|תום\s*לב)",
    ),
    (
        "איך מתבטל חוזה פסול?",
        r"(KL.?30|סעיף\s*30|חוזה\s*פסול|תקנת)",
    ),
]
|
|
|
|
def call(query, top_k=30):
    """Send one retrieval query to the API and return the decoded JSON reply.

    Args:
        query: The query text to search for.
        top_k: Value sent as the ``top_k`` field of the request body.

    Returns:
        The response body parsed from UTF-8 encoded JSON.

    Raises:
        Whatever ``urllib.request.urlopen`` or ``json.loads`` raise on a
        connection failure, HTTP error, timeout, or malformed response.
    """
    payload = {"query": query, "top_k": top_k}
    # ensure_ascii=False sends non-ASCII query text as raw UTF-8 instead of
    # \uXXXX escapes.
    encoded = json.dumps(payload, ensure_ascii=False).encode("utf-8")
    request = urllib.request.Request(
        API,
        data=encoded,
        headers={"Content-Type": "application/json; charset=utf-8"},
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        raw = response.read()
    return json.loads(raw.decode("utf-8"))
|
|
|
|
def hit_rank(docs, anchor_re):
    """Locate the first doc whose text matches the anchor pattern.

    Args:
        docs: Sequence of doc mappings; each doc's ``"text"`` entry is
            searched, and a missing or empty text is treated as ``""``.
        anchor_re: Regular expression passed to ``re.search``.

    Returns:
        The 1-based position of the first matching doc, or 0 when no doc
        in ``docs`` matches.
    """
    matching_positions = (
        position
        for position, doc in enumerate(docs, start=1)
        if re.search(anchor_re, doc.get("text") or "")
    )
    # The generator is lazy, so scanning stops at the first match.
    return next(matching_positions, 0)
|
|
|
|
def main():
    """Run every diagnostic query and print a retrieval hit-rate report.

    For each ``(query, anchor)`` pair in ``TESTS`` the API is asked for the
    top 30 docs and the rank of the first doc matching the anchor is
    printed. Afterwards the hit counts at top-1 / top-3 / top-10 / top-30
    are printed, followed by a verdict on whether a reranker working on
    this candidate pool can succeed.

    Queries whose API call fails are reported and excluded from the
    aggregate. If no query succeeds, the aggregate is skipped instead of
    dividing by zero.
    """
    print("=" * 78)
    print(" BM25 QUALITY PROBE — does retrieval surface the right doc?")
    print("=" * 78)

    rows = []
    n_errors = 0
    for q, anchor in TESTS:
        try:
            r = call(q, top_k=30)
        except Exception as e:
            # Best-effort probe: report the failure and move on to the
            # next query, but remember that it happened.
            n_errors += 1
            print(f"\n❌ {q}\n API error: {e}")
            continue
        docs = r.get("docs") or []
        rank = hit_rank(docs, anchor)
        rows.append({"q": q, "rank": rank, "n_returned": len(docs)})
        marker = "✓" if rank > 0 else "✗"
        print(f"{marker} rank={rank or '—':>3} /{len(docs):>2} {q}")

    n = len(rows)
    if n == 0:
        # Every call failed (e.g. the API is not running) or TESTS is empty.
        # The percentages below would raise ZeroDivisionError and the
        # verdict would be meaningless, so stop here.
        print()
        print(f" No successful queries ({n_errors} API error(s)) — "
              "cannot compute hit rates.")
        return

    # rank == 0 means "no match in the returned docs"; hits are ranks >= 1.
    ranks = [row["rank"] for row in rows]
    n_top1 = sum(1 for rank in ranks if rank == 1)
    n_top3 = sum(1 for rank in ranks if 1 <= rank <= 3)
    n_top10 = sum(1 for rank in ranks if 1 <= rank <= 10)
    n_top30 = sum(1 for rank in ranks if rank >= 1)
    n_miss = sum(1 for rank in ranks if rank == 0)

    print()
    print("=" * 78)
    print(" AGGREGATE")
    print("=" * 78)
    print(f" total queries: {n}")
    print(f" hit @ top-1: {n_top1}/{n} ({100*n_top1/n:.0f}%)")
    print(f" hit @ top-3: {n_top3}/{n} ({100*n_top3/n:.0f}%)")
    print(f" hit @ top-10: {n_top10}/{n} ({100*n_top10/n:.0f}%)")
    print(f" hit @ top-30: {n_top30}/{n} ({100*n_top30/n:.0f}%)")
    print(f" missed entirely: {n_miss}/{n}")
    if n_errors:
        # Failed queries are not part of `n`, so the rates and the verdict
        # describe only the queries that got a response.
        print(f" API errors: {n_errors} (excluded from the rates above; "
              "verdict is based on partial results)")
    print()
    print(" VERDICT for reranker viability:")
    if n_top10 >= n - 2:
        print(" ✅ STRONG — reranker on top-30 will likely succeed.")
    elif n_top30 >= n * 0.7:
        print(" ⚠️ MIXED — top-30 mostly covers but BM25 needs tuning")
        print(" OR add dense retriever before reranker.")
    else:
        print(" ❌ WEAK — reranker can't fix retrieval gap. Fix BM25 first.")


if __name__ == "__main__":
    main()
|
|