File size: 3,889 Bytes
eb83689
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
"""LangSmith evaluation for the NoteGuard agent slice.

Two evaluators that map straight onto the judging story:
  1. zero_phi_to_model  - the hard privacy guarantee (must be 1.0)
  2. faithfulness       - LLM-as-judge: is every claim supported by the note?

Run:  python -m eval.run_eval
Needs: LANGSMITH_API_KEY, GOOGLE_API_KEY, TAVILY_API_KEY (+ LANGSMITH_TRACING=true)

API note: the LangSmith evaluate surface has shifted across versions. This targets
langsmith>=0.1 with dict-style evaluators (inputs/outputs). Adjust signatures if
your installed version differs.
"""

from __future__ import annotations

from dotenv import load_dotenv

load_dotenv()

from langchain.chat_models import init_chat_model
from langchain_core.messages import HumanMessage
from langsmith import Client

from agent.graph import build_graph
from src.deid import NoteGuard

# Identifiers supplied as `known=` to both build_graph() and NoteGuard() below,
# keyed by category with a list of literal values per category.
KNOWN = {"PERSON": ["Margaret Okafor"], "NHS": ["485 777 3456"]}

# Evaluation examples. Each entry carries a free-text "note" and a "question";
# the __main__ block uploads these two fields as the dataset inputs, and
# target() concatenates them into a single human message.
# NOTE(review): presumably synthetic patient data - confirm before sharing.
EXAMPLES = [
    {
        "note": (
            "Ward 4B. Pt Margaret Okafor (NHS 485 777 3456, DOB 22/09/1958, F, 45 Elm Road SW1A 1AA). "
            "GP: Dr James Obi, Riverside Surgery, Lambeth SE1 7PB. "
            "Admitted 12 Jan 2025 via ED with acute exacerbation of COPD. "
            "PMH: COPD (GOLD III), T2DM on metformin, hypertension on amlodipine. NKDA. "
            "O2 sats 88% on air. Managed with nebulised salbutamol, ipratropium, IV hydrocortisone, "
            "doxycycline. CXR: bilateral hyperinflation, no consolidation. WBC 11.2, CRP 78. "
            "Discharged 14 Jan 2025. TTO: carbocisteine 375 mg TDS, prednisolone 30 mg OD 5/7, "
            "doxycycline 100 mg OD 4/7. Metformin and amlodipine continued. "
            "Consultant: Dr Sarah Chen, Respiratory Medicine."
        ),
        "question": "Draft an NHS eDischarge summary.",
    },
]

# LangSmith client, constructed at import time; used by the __main__ block to
# create the dataset and run the evaluation.
client = Client()
# Judge chat model, created lazily on the first faithfulness() call and reused.
_judge = None


def _content_str(content) -> str:
    """Flatten AIMessage content — Gemini returns a list of blocks, not a plain string."""
    if isinstance(content, list):
        return " ".join(b.get("text", "") if isinstance(b, dict) else str(b) for b in content)
    return content or ""


def target(inputs: dict) -> dict:
    """Run the agent graph on one example and collect what the evaluators need.

    Args:
        inputs: Example inputs with a "note" and a "question" string.

    Returns:
        A dict with "clinician_answer" (taken from the final graph state,
        empty string when absent) and "model_facing" (the flattened content of
        every message in the final state, joined with single spaces).
    """
    agent = build_graph(known=KNOWN)
    # The note and the question travel together as one human turn.
    user_turn = HumanMessage(content="\n\n".join([inputs["note"], inputs["question"]]))
    final_state = agent.invoke({"messages": [user_turn]})

    flattened = [
        _content_str(getattr(message, "content", ""))
        for message in final_state["messages"]
    ]
    return {
        "clinician_answer": final_state.get("clinician_answer", ""),
        "model_facing": " ".join(flattened),
    }


def zero_phi_to_model(inputs: dict, outputs: dict) -> dict:
    """Score 1.0 when no residual identifiers are found in the model-facing text.

    Args:
        inputs: Example inputs (unused by this evaluator).
        outputs: Target outputs; "model_facing" is the text that gets scanned.

    Returns:
        An evaluator result dict with key "zero_phi_to_model" and a score of
        1.0 if NoteGuard reports no residual identifiers, else 0.0.
    """
    guard = NoteGuard(known=KNOWN)
    leaked = guard.residual_identifiers(outputs["model_facing"])
    score = 0.0 if leaked else 1.0
    return {"key": "zero_phi_to_model", "score": score}


def _parse_unit_score(raw: str) -> float:
    """Parse the judge's reply into a score clamped to [0.0, 1.0].

    Only the first whitespace-delimited token is considered. Surrounding
    markdown/quote/bracket characters and trailing punctuation are trimmed so
    replies such as ``0.9.`` or ``**0.9**`` still parse. Anything that is not a
    finite number yields 0.0, so the metric fails closed.
    """
    import math  # local import keeps this block self-contained

    tokens = raw.split()
    if not tokens:
        return 0.0
    # lstrip deliberately leaves a leading "." or sign alone (".5", "-0.2").
    token = tokens[0].lstrip("*`\"'([").rstrip("*`\"')].,;:")
    try:
        value = float(token)
    except ValueError:
        return 0.0
    # float() accepts "nan"/"inf"; min(1.0, nan) returns 1.0, which would
    # silently award a perfect score, so reject non-finite values outright.
    if not math.isfinite(value):
        return 0.0
    return max(0.0, min(1.0, value))


def faithfulness(inputs: dict, outputs: dict) -> dict:
    """LLM-as-judge evaluator: is every claim in the summary supported by the note?

    The judge model is created on first use and cached in the module-level
    ``_judge`` so later calls reuse it.

    Args:
        inputs: Example inputs; "note" is the source text shown to the judge.
        outputs: Target outputs; "clinician_answer" is the summary being judged.

    Returns:
        An evaluator result dict with key "faithfulness" and a score in
        [0.0, 1.0]. A reply that cannot be parsed as a finite number scores 0.0.
    """
    global _judge
    _judge = _judge or init_chat_model("google_genai:gemini-2.5-flash")
    prompt = (
        f"NOTE:\n{inputs['note']}\n\nSUMMARY:\n{outputs['clinician_answer']}\n\n"
        "Is every clinical claim in SUMMARY supported by NOTE? "
        "Reply with a single number between 0 and 1."
    )
    raw = _content_str(_judge.invoke(prompt).content)
    return {"key": "faithfulness", "score": _parse_unit_score(raw)}


if __name__ == "__main__":
    dataset_name = "noteguard-discharge-eval"

    # Seed the dataset only when it does not exist yet. An explicit existence
    # check (rather than swallowing every exception) lets authentication,
    # network, and payload errors surface instead of being mistaken for
    # "dataset already exists from a previous run".
    if not client.has_dataset(dataset_name=dataset_name):
        dataset = client.create_dataset(dataset_name)
        client.create_examples(
            dataset_id=dataset.id,
            inputs=[{"note": e["note"], "question": e["question"]} for e in EXAMPLES],
        )

    # Run the target over every example and score it with both evaluators.
    client.evaluate(
        target,
        data=dataset_name,
        evaluators=[zero_phi_to_model, faithfulness],
        experiment_prefix="noteguard-slice",
    )