File size: 4,640 Bytes
3be54c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/env python3
"""
BM25 retrieval quality probe — does the retriever find the RIGHT doc
in top-K for each diagnostic query? This determines whether a reranker
can possibly succeed (reranker is only as good as its candidate pool).

For each test query: ask the API for top-30 docs, then check whether
the doc containing the expected anchor term shows up in top-1 / top-3 /
top-10 / top-30. The anchor is the ground-truth section/concept that
the teacher answer cites.
"""
import json
import re
import urllib.request
from pathlib import Path

# Query endpoint of the retrieval service under test (local host, port 8000).
API = "http://127.0.0.1:8000/v1/query"

# Diagnostic cases as (query, anchor_regex) pairs. A retrieved doc counts as
# "the right one" when anchor_regex matches somewhere in its text. The
# section-specific patterns accept any of: an explicit "KL<sep>N" marker
# (e.g. "[KL§N]"), the Hebrew phrase for "section N", or the section's
# hallmark term.
TESTS = [
    ("מה אומר סעיף 39 לחוק החוזים?",   r"(KL.?39|סעיף\s*39|תום\s*לב)"),
    ("מה אומר סעיף 12 לחוק החוזים?",   r"(KL.?12|סעיף\s*12|תום\s*לב\s*במשא)"),
    ("מה אומר סעיף 30 לחוק החוזים?",   r"(KL.?30|סעיף\s*30|חוזה\s*פסול|תקנת\s*הציבור)"),
    ("מה זה גמירת דעת?",                r"גמירת\s*דעת"),
    ("מה זה מסויימות?",                 r"מסוימות|מסויימות"),
    ("הסבר על קיום בתום לב",            r"(KL.?39|סעיף\s*39|תום\s*לב|קיום)"),
    ("מה התרופות במכר פגום?",            r"(תרופות|אכיפה|מכר|פגם)"),
    ("מתי קונה רשאי לבטל מכר?",          r"(ביטול|מכר|הפרה\s*יסודית|ארכה)"),
    ("מה ההבדל בין שכירות למכירה?",      r"(שכירות|מכר|חוק\s*המכר|חוק\s*השכירות)"),
    ("שנשר25512551",                     r"(NOMATCH_GIBBERISH)"),  # gibberish query; the placeholder anchor is not expected to match any doc, so this case counts as a miss
    ("הסבר את חובת תום הלב בחוזים",       r"(KL.?39|סעיף\s*39|תום\s*לב)"),
    ("איך מתבטל חוזה פסול?",             r"(KL.?30|סעיף\s*30|חוזה\s*פסול|תקנת)"),
]


def call(query, top_k=30):
    """POST a query to the retrieval API and return the decoded JSON reply.

    The request body is a JSON object with the keys ``query`` and
    ``top_k``, sent UTF-8 encoded to ``API``. The call waits at most
    30 seconds; errors raised by ``urllib`` or by JSON decoding are not
    caught here and propagate to the caller.
    """
    payload = {"query": query, "top_k": top_k}
    # ensure_ascii=False sends non-ASCII query text as literal UTF-8
    # instead of \uXXXX escapes.
    encoded = json.dumps(payload, ensure_ascii=False).encode("utf-8")
    request = urllib.request.Request(
        API,
        data=encoded,
        headers={"Content-Type": "application/json; charset=utf-8"},
    )
    with urllib.request.urlopen(request, timeout=30) as response:
        raw = response.read()
    return json.loads(raw.decode("utf-8"))


def hit_rank(docs, anchor_re):
    """Locate the first doc in `docs` whose text matches `anchor_re`.

    Each doc is read through ``.get("text")``; a missing or falsy text is
    treated as the empty string. Returns the 1-indexed position of the
    first match, or 0 when no doc matches (including when `docs` is empty).
    """
    matching_positions = (
        position
        for position, doc in enumerate(docs, 1)
        if re.search(anchor_re, doc.get("text") or "")
    )
    # next() stops at the first hit; the default 0 signals "no match".
    return next(matching_positions, 0)


def main():
    """Run every diagnostic query and print a retrieval-quality report.

    For each ``(query, anchor_regex)`` pair in ``TESTS`` the API is asked
    for the top 30 docs and the 1-indexed rank of the first doc matching
    the anchor is recorded (0 means no match). One line per query is
    printed as results arrive, followed by hit counts at top-1 / top-3 /
    top-10 / top-30 and a verdict on reranker viability.

    Queries whose API call raises are reported and left out of the
    aggregate; their count is shown in the summary. When no query could be
    scored at all, the percentages and the verdict are skipped rather than
    dividing by zero.
    """
    print("=" * 78)
    print(" BM25 QUALITY PROBE — does retrieval surface the right doc?")
    print("=" * 78)

    rows = []
    n_errors = 0
    for q, anchor in TESTS:
        try:
            r = call(q, top_k=30)
        except Exception as e:
            # Deliberately broad: one failing query must not abort the
            # whole probe. The failure is printed and counted instead.
            n_errors += 1
            print(f"\n❌ {q}\n   API error: {e}")
            continue
        docs = r.get("docs") or []
        rank = hit_rank(docs, anchor)
        rows.append({"q": q, "rank": rank, "n_returned": len(docs)})
        marker = "✓" if rank > 0 else "✗"
        print(f"{marker}  rank={rank or '—':>3}  /{len(docs):>2}  {q}")

    n = len(rows)

    print()
    print("=" * 78)
    print(" AGGREGATE")
    print("=" * 78)
    print(f"  total queries:      {n}")
    if n_errors:
        print(f"  API errors:         {n_errors} (excluded from totals)")
    if n == 0:
        # Nothing was scored, so every ratio below would be x/0 and the
        # verdict would be meaningless.
        print("  No queries were scored — cannot compute hit rates or a verdict.")
        return

    # rank == 0 means "anchor not found in the returned docs".
    n_top1  = sum(1 for r in rows if r["rank"] == 1)
    n_top3  = sum(1 for r in rows if 1 <= r["rank"] <= 3)
    n_top10 = sum(1 for r in rows if 1 <= r["rank"] <= 10)
    n_top30 = sum(1 for r in rows if r["rank"] >= 1)
    n_miss  = sum(1 for r in rows if r["rank"] == 0)

    print(f"  hit @ top-1:        {n_top1}/{n} ({100*n_top1/n:.0f}%)")
    print(f"  hit @ top-3:        {n_top3}/{n} ({100*n_top3/n:.0f}%)")
    print(f"  hit @ top-10:       {n_top10}/{n} ({100*n_top10/n:.0f}%)")
    print(f"  hit @ top-30:       {n_top30}/{n} ({100*n_top30/n:.0f}%)")
    print(f"  missed entirely:    {n_miss}/{n}")
    print()
    print("  VERDICT for reranker viability:")
    if n_errors:
        print(f"    (based on {n} scored queries; {n_errors} failed and were skipped)")
    if n_top10 >= n - 2:    # tolerate 2 misses (e.g. gibberish + 1 hard)
        print("    ✅ STRONG — reranker on top-30 will likely succeed.")
    elif n_top30 >= n * 0.7:
        print("    ⚠️  MIXED — top-30 mostly covers but BM25 needs tuning")
        print("       OR add dense retriever before reranker.")
    else:
        print("    ❌ WEAK — reranker can't fix retrieval gap. Fix BM25 first.")


# Run the probe only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()