"""clustering.py — anchor-based doctrine clusters over the citation graph.

Level 6 of the hierarchical legal graph. Builds on `citation_network`
to identify "doctrine clusters" — groups of judgments that revolve
around a common leading case (the anchor).

Why anchor-based and not Louvain
─────────────────────────────────
Louvain / Leiden / spectral clustering give mathematically clean
communities but they don't map to how lawyers think. A lawyer thinks
in terms of doctrines: "the rule of פרשת אבוטבול" or "the test from
מגדלי הים התיכון". These doctrines have a natural anchor — the
leading case — and a natural perimeter: the precedents that cite
the anchor (applications), and the precedents the anchor itself
relied on (origins).

So a cluster, for us, is:

    cluster(anchor) := {anchor}
                       ∪ docs that cite anchor          (applications)
                       ∪ docs that anchor cites          (origins)

This is a 1-hop ego-network around each anchor. Anchors are picked
by in-degree on the citation graph — the top-K most-cited docs in
each domain become anchors.

A doc can belong to MULTIPLE clusters (a single judgment often sits
in 2-3 doctrines). That's fine and even desired — it lets us trace
cross-doctrine reasoning.

What we expose
──────────────
    DoctrineCluster — dataclass with anchor + members + stats.
    build_clusters(cn, ...) → list of DoctrineCluster
    cluster_for_query(clusters, retrieved_ids) → ranked clusters
    classify_role(cluster, doc_id) → "anchor"/"application"/"origin"/"outside"

This is read-only over the existing CitationNetwork — no rebuild,
no extra storage. The clusters are built on demand and cached on
the pipeline.
"""
from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Set, Tuple

from .citation_network import CitationNetwork


# ──────────────────────────────────────────────────────────────────────
# Public dataclass
# ──────────────────────────────────────────────────────────────────────

@dataclass
class DoctrineCluster:
    """One doctrine cluster: an anchor document plus its 1-hop neighbours.

    Roles a document can play, as reported by :meth:`role_of`:
      • anchor       — the document whose id equals ``anchor_id``
      • application  — a document listed in ``applications``
      • origin       — a document listed in ``origins``

    Note that ``size`` is a plain sum of the list lengths, so a document
    present in both ``applications`` and ``origins`` is counted twice
    there, whereas :meth:`members` reports it once.
    """
    # Identifier of the cluster.
    cluster_id: str
    # doc_id of the anchor document.
    anchor_id: str
    # Human-readable label for the anchor, when one is known.
    anchor_label: Optional[str] = None
    # doc_ids stored as applications of the anchor.
    applications: List[str] = field(default_factory=list)
    # doc_ids stored as origins of the anchor.
    origins: List[str] = field(default_factory=list)
    # Count of incoming citations recorded for the anchor.
    n_in: int = 0
    # Count of outgoing citations recorded for the anchor.
    n_out: int = 0
    # Domain label attached to the cluster, if any.
    domain: Optional[str] = None
    # Statute citation strings associated with the cluster.
    statute_refs: List[str] = field(default_factory=list)
    # Free-form extra data (provenance flags and the like).
    extra: Dict[str, Any] = field(default_factory=dict)

    @property
    def size(self) -> int:
        """Anchor (1) plus the lengths of ``applications`` and ``origins``."""
        neighbours = len(self.applications) + len(self.origins)
        return neighbours + 1

    def members(self) -> List[str]:
        """Return every distinct doc_id in the cluster, anchor first.

        Order is: anchor, then applications, then origins; repeated ids
        keep only their first position.
        """
        ordered = [self.anchor_id, *self.applications, *self.origins]
        # dict keys preserve insertion order, giving an order-keeping dedup.
        return list(dict.fromkeys(ordered))

    def role_of(self, doc_id: str) -> Optional[str]:
        """Return the role of ``doc_id`` in this cluster, or None if absent.

        The anchor check wins over applications, which win over origins.
        """
        if doc_id == self.anchor_id:
            return "anchor"
        buckets = (
            ("application", self.applications),
            ("origin", self.origins),
        )
        for role, bucket in buckets:
            if doc_id in bucket:
                return role
        return None


# ──────────────────────────────────────────────────────────────────────
# Build
# ──────────────────────────────────────────────────────────────────────

def _anchor_in_degree(cn: CitationNetwork) -> Dict[str, int]:
    """Compute in-degree (incoming citations) for every resolved corpus doc.

    A doc's in-degree is the number of DISTINCT OTHER docs that cite it
    through any citation string resolving to it. Citers are pooled per
    target before counting, so a doc that reaches the target through two
    different citation strings is counted once, and a doc citing itself
    is not counted at all.

    Parameters
    ----------
    cn : CitationNetwork
        Read for ``doc_for_citation`` (citation string → target doc id,
        possibly empty/None) and ``cited_by`` (citation string → citers).

    Returns
    -------
    Dict[str, int] mapping each resolved target doc id to its in-degree.
    Keys appear in the order targets are first met in
    ``cn.doc_for_citation``; targets nobody else cites map to 0.
    """
    citers_by_target: Dict[str, Set[str]] = {}
    for citation, target_doc in cn.doc_for_citation.items():
        if not target_doc:
            # Citation string that does not resolve to a corpus doc.
            continue
        citers_by_target.setdefault(target_doc, set()).update(
            cn.cited_by.get(citation, ())
        )

    deg: Dict[str, int] = {}
    for target_doc, citers in citers_by_target.items():
        # "OTHER docs" — a self-citation carries no influence signal.
        citers.discard(target_doc)
        deg[target_doc] = len(citers)
    return deg


def _docs_citing(cn: CitationNetwork, doc_id: str) -> List[str]:
    """List the docs that cite `doc_id`, in first-seen order.

    Every citation string in ``cn.doc_for_citation`` that resolves to
    `doc_id` contributes its ``cn.cited_by`` entries. Each citer is
    reported once, and `doc_id` itself is never reported.
    """
    # A dict doubles as an insertion-ordered set of citers.
    ordered_citers: Dict[str, None] = {}
    for citation, resolved in cn.doc_for_citation.items():
        if resolved == doc_id:
            for citer in cn.cited_by.get(citation, []):
                if citer != doc_id:
                    ordered_citers.setdefault(citer, None)
    return list(ordered_citers)


def _docs_cited_by(cn: CitationNetwork, doc_id: str) -> List[str]:
    """List the corpus docs that `doc_id` cites, in first-seen order.

    Each outgoing citation entry in ``cn.cites[doc_id]`` is resolved via
    its ``"text"`` through ``cn.doc_for_citation``. Unresolved citations
    and `doc_id` itself are dropped, and each target is reported once.
    """
    # A dict doubles as an insertion-ordered set of targets.
    resolved_targets: Dict[str, None] = {}
    for entry in cn.cites.get(doc_id, []):
        hit = cn.doc_for_citation.get(entry["text"])
        if hit and hit != doc_id:
            resolved_targets.setdefault(hit, None)
    return list(resolved_targets)


def _docs_co_citing(
    cn: CitationNetwork, doc_id: str, *, min_overlap: int = 2,
) -> List[str]:
    """Co-citation expansion — docs sharing citation strings with `doc_id`.

    A doc qualifies when it cites at least `min_overlap` DISTINCT citation
    strings that `doc_id` also cites. This is meant for corpora where few
    docs cite each other directly but many cite the same precedents or
    statutes.

    Example: five articles each cite the same court ruling and the same
    statute section. They count as co-citing one another even if none of
    them cites the others directly.

    Parameters
    ----------
    cn : CitationNetwork
        Read for ``cites`` (doc id → citation entries with a ``"text"``
        key) and ``cited_by`` (citation string → citing doc ids).
    doc_id : str
        The doc whose co-citing neighbours are wanted.
    min_overlap : int
        Minimum number of shared distinct citation strings.

    Returns
    -------
    List[str] of qualifying doc ids, never including `doc_id`. The order
    is deterministic: docs appear in the order they are first met while
    walking `doc_id`'s citations in their stored order.
    """
    # Order-preserving dedup, so the result does not depend on the
    # arbitrary iteration order of a set of strings.
    my_cites = list(dict.fromkeys(c["text"] for c in cn.cites.get(doc_id, [])))
    if len(my_cites) < min_overlap:
        return []

    shared: Dict[str, int] = {}
    for cite in my_cites:
        # Dedupe citers per citation: a doc citing the same string twice
        # shares ONE citation with us, not two.
        for other in dict.fromkeys(cn.cited_by.get(cite, ())):
            if other != doc_id:
                shared[other] = shared.get(other, 0) + 1
    return [d for d, n in shared.items() if n >= min_overlap]


def _hash_id(s: str) -> str:
    """Return the first 12 hex characters of the MD5 digest of `s` (UTF-8)."""
    from hashlib import md5

    hex_digest = md5(s.encode("utf-8")).hexdigest()
    return hex_digest[:12]


def _virtual_anchor_clusters(
    cn: CitationNetwork,
    *,
    docs: Optional[List[Any]] = None,
    min_in_degree: int = 3,
    top_n: int = 500,
) -> List[DoctrineCluster]:
    """Build clusters anchored on citation STRINGS rather than corpus docs.

    Intended for corpora whose citations mostly point outside the corpus.
    Each sufficiently popular citation string becomes a "virtual anchor";
    the cluster is the set of docs citing that string. ``origins`` is
    always empty because there is no cited doc whose outgoing citations
    could be walked.

    Parameters
    ----------
    cn : CitationNetwork
        Read for ``cited_by``, ``cites`` and ``citation_meta``.
    docs : list of Document-like objects (optional)
        Objects with ``.id`` and ``.metadata``; used only for the
        majority-domain vote. Without them ``domain`` stays None.
    min_in_degree : int
        Minimum number of distinct citing docs for a citation string to
        become a virtual anchor.
    top_n : int
        Keep at most this many anchors, most-cited first.

    Returns
    -------
    List[DoctrineCluster], most-cited first (ties keep ``cn.cited_by``
    order). Each has ``anchor_id == "virtual:" + citation string`` and
    ``extra["virtual"] is True``. ``applications`` lists the citing docs
    in first-seen order, so the output is stable across processes.
    """
    doc_by_id: Dict[str, Any] = {}
    for d in docs or ():
        did = getattr(d, "id", None)
        if did:
            doc_by_id[did] = d

    # Dedupe each citer list ONCE, keeping first-seen order. list(set(...))
    # would make member order depend on per-process string hashing.
    ranked: List[Tuple[str, List[str]]] = []
    for cit, citers in cn.cited_by.items():
        unique_citers = list(dict.fromkeys(citers))
        if len(unique_citers) >= min_in_degree:
            ranked.append((cit, unique_citers))
    # Stable sort: equally-cited strings keep their cn.cited_by order.
    ranked.sort(key=lambda item: -len(item[1]))
    ranked = ranked[:top_n]

    clusters: List[DoctrineCluster] = []
    for cit_text, applications in ranked:
        if not applications:
            # Only reachable when min_in_degree <= 0.
            continue

        # Domain — majority vote among the citing docs we have metadata for.
        anchor_domain: Optional[str] = None
        if doc_by_id:
            dom_counts: Dict[str, int] = {}
            for app_id in applications:
                d = doc_by_id.get(app_id)
                if not d:
                    continue
                dom = (getattr(d, "metadata", None) or {}).get("domain")
                if dom:
                    dom_counts[dom] = dom_counts.get(dom, 0) + 1
            if dom_counts:
                anchor_domain = max(dom_counts.items(), key=lambda x: x[1])[0]

        # Up to five statutes cited most often by the cluster's members.
        statute_counts: Dict[str, int] = {}
        for app_id in applications:
            for c in cn.cites.get(app_id, []):
                if c.get("kind") == "statute":
                    statute_counts[c["text"]] = (
                        statute_counts.get(c["text"], 0) + 1
                    )
        top_statutes = sorted(
            statute_counts.items(), key=lambda x: -x[1])[:5]
        statute_refs = [s for s, _ in top_statutes]

        # The anchor is virtual — the citation string is its ID. The
        # 'virtual:' prefix lets callers tell it is not a corpus doc.
        anchor_id = f"virtual:{cit_text}"
        clusters.append(DoctrineCluster(
            cluster_id=_hash_id(anchor_id),
            anchor_id=anchor_id,
            anchor_label=cit_text,
            applications=applications,
            origins=[],
            n_in=len(applications),
            n_out=0,
            domain=anchor_domain,
            statute_refs=statute_refs,
            extra={"virtual": True,
                   "kind": cn.citation_meta.get(cit_text, {}).get("kind", "caselaw")},
        ))
    return clusters


def _cluster_statute_refs(
    cn: CitationNetwork, member_ids: List[str], limit: int = 5,
) -> List[str]:
    """Return up to `limit` statute citation strings cited most often by `member_ids`."""
    counts: Dict[str, int] = {}
    for member_id in member_ids:
        for c in cn.cites.get(member_id, []):
            if c.get("kind") == "statute":
                counts[c["text"]] = counts.get(c["text"], 0) + 1
    # sorted() is stable, so equally-cited statutes keep first-seen order.
    ranked = sorted(counts.items(), key=lambda kv: -kv[1])
    return [text for text, _ in ranked[:limit]]


def _cluster_majority_domain(
    doc_by_id: Dict[str, Any], member_ids: List[str],
) -> Optional[str]:
    """Return the most common ``metadata["domain"]`` among `member_ids`, or None."""
    votes: Dict[str, int] = {}
    for member_id in member_ids:
        d = doc_by_id.get(member_id)
        if not d:
            continue
        dom = (getattr(d, "metadata", None) or {}).get("domain")
        if dom:
            votes[dom] = votes.get(dom, 0) + 1
    if not votes:
        return None
    # max() returns the first maximal entry, so ties go to the domain seen first.
    return max(votes.items(), key=lambda kv: kv[1])[0]


def build_clusters(
    cn: CitationNetwork,
    *,
    docs: Optional[List[Any]] = None,    # for resolving anchor labels + domain
    min_in_degree: int = 3,              # minimum citations to qualify as anchor
    top_n: int = 500,                    # cap on the number of anchors
    include_origins: bool = True,
    fallback_to_virtual: bool = True,   # if 0 in-corpus anchors, use virtuals
    augment_with_virtual: bool = False,  # produce virtual anchors ALONGSIDE in-corpus
    co_cite_expand: bool = True,        # expand sparse clusters via co-citation
    co_cite_min_overlap: int = 2,        # ≥N shared citations = co-cite neighbour
    co_cite_threshold: int = 3,          # only expand clusters with <N applications
) -> List[DoctrineCluster]:
    """Build doctrine clusters from a citation network.

    Parameters
    ----------
    cn : CitationNetwork
        The pre-built citation index.
    docs : list of Document-like objects (optional)
        Objects with ``.id`` and ``.metadata``. Used to resolve anchor
        labels (``metadata["citation"]``, else ``metadata["title"]``,
        else the doc id) and the cluster domain. Without them
        ``anchor_label`` and ``domain`` stay None.
    min_in_degree : int
        A doc needs at least this many incoming citations to qualify as
        an anchor.
    top_n : int
        Cap on the number of in-corpus anchors, and separately on the
        number of virtual anchors. With ``augment_with_virtual`` the
        combined result can therefore exceed ``top_n``.
    include_origins : bool
        If True, the cluster also includes the docs the anchor itself
        cites (``origins``); otherwise ``origins`` is empty.
    fallback_to_virtual : bool
        If no cluster was produced at all, return virtual-anchor clusters
        (popular citation strings as anchors) instead of an empty list.
    augment_with_virtual : bool
        If True, virtual-anchor clusters are appended to the in-corpus
        ones. Virtual anchors whose citation string resolves to an
        existing in-corpus anchor are skipped. The combined list is then
        re-sorted by ``n_in``.
    co_cite_expand : bool
        If True, clusters with fewer than ``co_cite_threshold``
        applications are extended with docs that share citations with
        the anchor.
    co_cite_min_overlap : int
        Number of shared citation strings needed to count as a co-citing
        neighbour.
    co_cite_threshold : int
        Co-citation expansion only applies to clusters with fewer direct
        applications than this.

    Returns
    -------
    List[DoctrineCluster] sorted by anchor in-degree, highest first.
    Docs added by co-citation are appended to ``applications`` and also
    listed under ``extra["co_cite_added"]``.
    """
    if cn.n_docs == 0:
        return []

    # Doc lookup for anchor metadata.
    doc_by_id: Dict[str, Any] = {}
    for d in docs or ():
        did = getattr(d, "id", None)
        if did:
            doc_by_id[did] = d

    # Anchor candidates by in-degree; stable sort keeps ties in dict order.
    deg = _anchor_in_degree(cn)
    anchors = sorted(
        ((did, n) for did, n in deg.items() if n >= min_in_degree),
        key=lambda x: -x[1],
    )[:top_n]

    clusters: List[DoctrineCluster] = []
    for anchor_id, in_deg in anchors:
        applications = _docs_citing(cn, anchor_id)
        origins = _docs_cited_by(cn, anchor_id) if include_origins else []

        # Co-citation expansion — only for sparse clusters where the
        # direct citation graph gives too few applications.
        co_cite_added: List[str] = []
        if co_cite_expand and len(applications) < co_cite_threshold:
            existing = {anchor_id, *applications, *origins}
            for cand in _docs_co_citing(
                cn, anchor_id, min_overlap=co_cite_min_overlap,
            ):
                if cand not in existing:
                    co_cite_added.append(cand)
                    existing.add(cand)
            if co_cite_added:
                applications = applications + co_cite_added

        # Distinct members, anchor first. A doc that both cites the
        # anchor and is cited by it sits in BOTH lists; it must count
        # once in the tallies below.
        members = list(dict.fromkeys([anchor_id, *applications, *origins]))

        # Anchor metadata.
        anchor_label: Optional[str] = None
        anchor_domain: Optional[str] = None
        anchor_doc = doc_by_id.get(anchor_id)
        if anchor_doc is not None:
            md = (getattr(anchor_doc, "metadata", None) or {})
            anchor_label = (md.get("citation") or md.get("title")
                            or anchor_id)
            anchor_domain = md.get("domain")

        # Most-cited statutes within this cluster.
        statute_refs = _cluster_statute_refs(cn, members)

        # Domain — the anchor's own domain wins; otherwise majority vote
        # among members that have metadata.
        if doc_by_id and not anchor_domain:
            voted = _cluster_majority_domain(doc_by_id, members)
            if voted is not None:
                anchor_domain = voted

        clusters.append(DoctrineCluster(
            cluster_id=_hash_id(anchor_id),
            anchor_id=anchor_id,
            anchor_label=anchor_label,
            applications=applications,
            origins=origins,
            n_in=in_deg,
            n_out=len(origins),
            domain=anchor_domain,
            statute_refs=statute_refs,
            extra=({"co_cite_added": co_cite_added} if co_cite_added else {}),
        ))

    # Augmentation — add virtual anchors for popular citation strings
    # alongside the in-corpus clusters, skipping strings already
    # represented by an in-corpus anchor, then re-sort by influence.
    if augment_with_virtual:
        existing_anchor_ids = {c.anchor_id for c in clusters}
        virtuals = _virtual_anchor_clusters(
            cn, docs=docs,
            min_in_degree=max(2, min_in_degree),
            top_n=top_n,
        )
        for vc in virtuals:
            cit_text = vc.anchor_id[len("virtual:"):]
            target = cn.doc_for_citation.get(cit_text)
            if target and target in existing_anchor_ids:
                # Already represented by the in-corpus anchor — skip.
                continue
            clusters.append(vc)
        clusters.sort(key=lambda c: -c.n_in)

    # Fallback — nothing found at all: treat popular citation strings
    # themselves as anchors. The threshold is never below 2 here.
    if not clusters and fallback_to_virtual:
        clusters = _virtual_anchor_clusters(
            cn, docs=docs,
            min_in_degree=max(2, min_in_degree),
            top_n=top_n,
        )

    return clusters


# ──────────────────────────────────────────────────────────────────────
# Query → cluster routing
# ──────────────────────────────────────────────────────────────────────

def cluster_for_query(
    clusters: List[DoctrineCluster],
    retrieved_doc_ids: List[str],
    *,
    top_k: int = 3,
    query_domain: Optional[str] = None,
    domain_match_boost: float = 1.5,
    domain_miss_penalty: float = 0.7,
    query_text: Optional[str] = None,
    # Boost ≠ 1.0 enables a query-keyword vs cluster-anchor-label match
    # boost. Default OFF (1.0) — a previous experiment with 1.3 hurt
    # more than it helped on Hebrew legal queries because common roots
    # appear in many anchor_labels and amplify noise more than signal.
    # Keep the parameter so future callers can opt in once a smarter
    # token-filter is in place.
    keyword_boost: float = 1.0,
) -> List[Tuple[DoctrineCluster, float]]:
    """Rank clusters by how densely they cover a retrieved doc list.

    Base score
    ──────────
    Each retrieved doc id gets a rank weight ``1 / (1 + 0.2 * rank)``
    (rank 0 = best hit). If an id appears more than once in
    ``retrieved_doc_ids``, its FIRST (best) rank is used. A cluster's
    score is the sum of the weights of its members that were retrieved,
    divided by ``cluster.size ** 0.5`` so that a very large cluster does
    not win purely by size. Clusters with no retrieved member are
    dropped.

    Domain-aware ranking
    ────────────────────
    When ``query_domain`` is given, a cluster's score is multiplied by:
      • ``domain_match_boost`` when ``cluster.domain == query_domain``
      • ``domain_miss_penalty`` when the cluster has a different domain
      • nothing when the cluster has no domain set

    Lexical-hint boost
    ──────────────────
    Active only when ``query_text`` is given AND ``keyword_boost`` is not
    1.0 (the default is 1.0, i.e. disabled). Whitespace-separated query
    tokens are stripped of surrounding punctuation and lower-cased;
    tokens shorter than 4 characters are ignored. If any remaining token
    is a substring of the cluster's ``anchor_label`` or of its joined
    ``statute_refs``, the score is multiplied by ``keyword_boost`` once.

    Parameters
    ----------
    clusters : list of DoctrineCluster
        Candidate clusters.
    retrieved_doc_ids : list of str
        Retrieval results, most relevant first.
    top_k : int
        Number of results to return.
    query_domain, domain_match_boost, domain_miss_penalty
        See "Domain-aware ranking".
    query_text, keyword_boost
        See "Lexical-hint boost".

    Returns
    -------
    Up to ``top_k`` ``(cluster, score)`` tuples, score descending. Ties
    keep the input order of ``clusters``. Returns ``[]`` when either
    input list is empty.
    """
    if not clusters or not retrieved_doc_ids:
        return []

    # Rank weights — keep the weight of the FIRST occurrence so that a
    # repeated id cannot downgrade its own best rank.
    rank_weight: Dict[str, float] = {}
    for i, did in enumerate(retrieved_doc_ids):
        if did not in rank_weight:
            rank_weight[did] = 1.0 / (1 + i * 0.2)
    retrieved_set = set(rank_weight)

    # Tokenize the query once, and only if the lexical boost is enabled.
    # Only leading/trailing punctuation is stripped; characters inside a
    # token are kept. The strip set is carried over unchanged.
    query_keywords: List[str] = []
    if query_text and keyword_boost != 1.0:
        for tok in query_text.split():
            cleaned = tok.strip(".,;:?!()[]{}́").lower()
            if len(cleaned) >= 4:
                query_keywords.append(cleaned)

    scored: List[Tuple[DoctrineCluster, float]] = []
    for c in clusters:
        overlap = set(c.members()) & retrieved_set
        if not overlap:
            continue
        # Weighted overlap (top hits matter more than tail).
        weighted = sum(rank_weight.get(did, 0.0) for did in overlap)
        # Size penalty so a huge cluster doesn't dominate by being large.
        score = weighted / (max(c.size, 1) ** 0.5)
        # Domain-aware multiplier — only when both sides have a domain.
        if query_domain and c.domain:
            if c.domain == query_domain:
                score *= float(domain_match_boost)
            else:
                score *= float(domain_miss_penalty)
        # Lexical-hint boost (query_keywords is empty when disabled).
        if query_keywords:
            haystack = (
                (c.anchor_label or "") + " " + " ".join(c.statute_refs or [])
            ).lower()
            if any(kw in haystack for kw in query_keywords):
                score *= float(keyword_boost)
        scored.append((c, score))

    # Stable sort: equal scores keep the order of `clusters`.
    scored.sort(key=lambda t: -t[1])
    return scored[:top_k]


def classify_role(cluster: DoctrineCluster, doc_id: str) -> str:
    """Report the role of `doc_id` inside `cluster`.

    Delegates to ``cluster.role_of`` and substitutes ``'outside'`` when
    that yields no role, so the result is always a string.
    """
    role = cluster.role_of(doc_id)
    return role if role else "outside"


# ──────────────────────────────────────────────────────────────────────
# Pipeline integration
# ──────────────────────────────────────────────────────────────────────

def get_or_build_clusters(
    pipe,
    *,
    force_rebuild: bool = False,
    min_in_degree: Optional[int] = None,
    top_n: int = 500,
) -> List[DoctrineCluster]:
    """Return the pipeline's doctrine clusters, building them on first use.

    The result is stored on ``pipe._doctrine_clusters_cache`` and served
    from there on later calls unless ``force_rebuild`` is True. An empty
    list is a valid cached value. Note that a cache hit is returned
    regardless of the ``min_in_degree`` / ``top_n`` passed on this call.

    When ``min_in_degree`` is None it is chosen from the number of docs
    found on the pipeline: 3 for 10,000+ docs, 2 for 1,000+, otherwise 1.

    Virtual-anchor augmentation is on unless the environment variable
    ``TAU_RAG_CLUSTER_AUGMENT`` is exactly ``"0"``.

    Returns ``[]`` when ``pipe`` is None.
    """
    if pipe is None:
        return []

    if not force_rebuild:
        # `is not None` (not truthiness): an empty list must also count
        # as a hit, otherwise the build would be repeated on every call.
        cached = getattr(pipe, "_doctrine_clusters_cache", None)
        if cached is not None:
            return cached

    from .citation_network import get_or_build as _gob_cn
    cn = _gob_cn(pipe, force_rebuild=force_rebuild)

    # Prefer `_indexed_docs`, then `_docs`, then an empty list.
    docs = getattr(pipe, "_indexed_docs", None)
    if not docs:
        docs = getattr(pipe, "_docs", None)
    if not docs:
        docs = []

    # Adaptive anchor threshold, driven by corpus size.
    if min_in_degree is None:
        corpus_size = len(docs)
        if corpus_size >= 10_000:
            min_in_degree = 3
        elif corpus_size >= 1_000:
            min_in_degree = 2
        else:
            min_in_degree = 1

    # Opt-out switch for virtual-anchor augmentation.
    import os as _os
    augment_flag = _os.environ.get("TAU_RAG_CLUSTER_AUGMENT", "1")
    _augment = augment_flag != "0"

    built = build_clusters(
        cn,
        docs=docs,
        min_in_degree=min_in_degree,
        top_n=top_n,
        augment_with_virtual=_augment,
    )
    pipe._doctrine_clusters_cache = built
    # Fresh clusters: drop the memoized metadata map as well so it is
    # rebuilt on next use.
    _invalidate_meta_cache()
    return built


# ──────────────────────────────────────────────────────────────────────
# Inspection / debug
# ──────────────────────────────────────────────────────────────────────

# Single-slot memoization for the {doc_id: metadata} map built by
# _build_meta_by_id. Callers can invoke that function repeatedly with the
# SAME `docs` list; without this slot every call would walk the whole
# list again.
#
# Slots:
#   "docs_ref" — the docs list the cached map was built from
#   "len"      — len(docs) at build time
#   "value"    — the cached map itself (None = nothing cached)
#
# A hit requires the same list object (compared with `is`) AND an
# unchanged length, so a replaced list or one that grew/shrunk forces a
# rebuild. The list object itself is stored rather than its id(), because
# Python may recycle an id() after garbage collection. In-place changes
# to an existing doc's metadata are NOT detected; callers must then
# invalidate explicitly via _invalidate_meta_cache().
_meta_cache: Dict[str, Any] = {"docs_ref": None, "len": None, "value": None}


def _invalidate_meta_cache() -> None:
    """Clear all three slots of the metadata memo so the next lookup rebuilds."""
    for slot in ("docs_ref", "len", "value"):
        _meta_cache[slot] = None


def _build_meta_by_id(docs: Optional[List[Any]]) -> Dict[str, Dict[str, Any]]:
    """Return a memoized ``{doc_id: {id, title, domain, kind}}`` map.

    For each doc with a truthy ``.id``:
      • ``title``  — metadata "title", else "citation", else the doc id
      • ``domain`` — metadata "domain"
      • ``kind``   — metadata "case_type", else "kind"

    The SAME dict instance is handed back while `docs` is the same list
    object with the same length, so changes a caller makes to the
    returned dict are visible on later calls. ``docs=None`` yields a
    fresh empty dict and leaves the memo untouched.
    """
    if docs is None:
        return {}

    memo = _meta_cache["value"]
    if memo is not None and _meta_cache["docs_ref"] is docs:
        if _meta_cache["len"] == len(docs):
            return memo

    rows: Dict[str, Dict[str, Any]] = {}
    for doc in docs:
        key = getattr(doc, "id", None)
        if key:
            meta = getattr(doc, "metadata", None) or {}
            rows[key] = {
                "id":     key,
                "title":  meta.get("title") or meta.get("citation") or key,
                "domain": meta.get("domain"),
                "kind":   meta.get("case_type") or meta.get("kind"),
            }

    # Remember what the map was built from, for the next call's check.
    _meta_cache["docs_ref"] = docs
    _meta_cache["len"] = len(docs)
    _meta_cache["value"] = rows
    return rows


def resolve_cluster_members(
    docs: Optional[List[Any]],
    member_ids: List[str],
    *,
    co_cite_added: Optional[List[str]] = None,
) -> List[Dict[str, Any]]:
    """Turn member doc IDs into display rows.

    Each row has the keys ``id``, ``title``, ``domain``, ``kind`` and
    ``is_co_cite``; rows come back in the order of `member_ids`.

    Parameters
    ----------
    docs
        Documents with ``.id`` and ``.metadata`` to look metadata up in.
        May be None or empty, in which case every row carries only its
        id and ``None`` for title/domain/kind.
    member_ids
        Doc IDs to resolve, in the desired output order.
    co_cite_added
        Optional collection of IDs to tag with ``is_co_cite: True``;
        every other row gets ``False``.
    """
    flagged: Set[str] = set(co_cite_added or [])
    lookup = _build_meta_by_id(docs)

    def _row(did: str) -> Dict[str, Any]:
        known = lookup.get(did)
        if known is not None:
            return {**known, "is_co_cite": did in flagged}
        # No metadata for this id (e.g. a virtual anchor id).
        return {
            "id":         did,
            "title":      None,
            "domain":     None,
            "kind":       None,
            "is_co_cite": did in flagged,
        }

    return [_row(did) for did in member_ids]


def find_clusters_for_doc(
    clusters: List[DoctrineCluster],
    doc_id: str,
) -> List[Tuple[DoctrineCluster, str]]:
    """Inverse routing — list the clusters a document belongs to.

    Every cluster whose ``role_of(doc_id)`` is truthy is returned
    together with that role.

    Returns ``[(cluster, role), ...]`` ordered by role tier (anchor,
    then application, then origin, then anything else) and, within a
    tier, by ``n_in`` descending. Remaining ties keep input order.
    Returns ``[]`` when `clusters` or `doc_id` is empty.
    """
    if not clusters or not doc_id:
        return []

    tier_of = {"anchor": 0, "application": 1, "origin": 2}
    with_roles = ((cluster, cluster.role_of(doc_id)) for cluster in clusters)
    hits = [(cluster, role) for cluster, role in with_roles if role]
    # Tier ascending, then anchor in-degree descending (stable sort).
    hits.sort(key=lambda pair: (tier_of.get(pair[1], 3), -pair[0].n_in))
    return hits


def cluster_summary(c: DoctrineCluster) -> Dict[str, Any]:
    """Flatten a cluster into a plain dict of scalars and lists.

    Besides the cluster's own fields and list lengths, the result
    carries provenance read from ``c.extra``: the ids stored under
    ``"co_cite_added"`` (copied into a new list) with their count, and
    ``is_virtual``, the truthiness of ``extra["virtual"]``.
    """
    extra = c.extra or {}
    co_cited = extra.get("co_cite_added") or []

    summary: Dict[str, Any] = {
        "cluster_id":   c.cluster_id,
        "anchor_id":    c.anchor_id,
        "anchor_label": c.anchor_label,
        "size":         c.size,
        "n_applications": len(c.applications),
        "n_origins":      len(c.origins),
        "n_in":         c.n_in,
        "n_out":        c.n_out,
        "domain":       c.domain,
        "statute_refs": c.statute_refs,
    }
    # Provenance: members that came from co-citation expansion.
    summary["co_cite_added"] = list(co_cited)
    summary["n_co_cite_added"] = len(co_cited)
    # True when the anchor is a citation string, not a corpus doc.
    summary["is_virtual"] = bool(extra.get("virtual"))
    return summary


# Names exported by `from <this module> import *`. Underscore-prefixed
# helpers defined above are intentionally left out.
__all__ = [
    "DoctrineCluster",
    "build_clusters",
    "cluster_for_query",
    "classify_role",
    "get_or_build_clusters",
    "cluster_summary",
    "resolve_cluster_members",
    "find_clusters_for_doc",
]