"""clustering.py — anchor-based doctrine clusters over the citation graph.

Level 6 of the hierarchical legal graph. Builds on `citation_network` to
identify "doctrine clusters" — groups of judgments that revolve around a
common leading case (the anchor).

Why anchor-based and not Louvain
─────────────────────────────────
Louvain / Leiden / spectral clustering give mathematically clean
communities but they don't map to how lawyers think. A lawyer thinks in
terms of doctrines: "the rule of פרשת אבוטבול" (the Abutbul case) or
"the test from מגדלי הים התיכון" (Mediterranean Towers). These doctrines
have a natural anchor — the leading case — and a natural perimeter: the
precedents that cite the anchor (applications), and the precedents the
anchor itself relied on (origins).

So a cluster, for us, is:

    cluster(anchor) := {anchor}
                       ∪ docs that cite anchor     (applications)
                       ∪ docs that anchor cites    (origins)

This is a 1-hop ego-network around each anchor. Anchors are picked by
in-degree on the citation graph — the top-K most-cited docs (ranked
across the whole corpus) become anchors.

A doc can belong to MULTIPLE clusters (a single judgment often sits in
2-3 doctrines). That's fine and even desired — it lets us trace
cross-doctrine reasoning.

What we expose
──────────────
    DoctrineCluster       — dataclass with anchor + members + stats.
    build_clusters(cn, ...)                   → list of DoctrineCluster
    cluster_for_query(clusters, retrieved_ids) → ranked clusters
    classify_role(cluster, doc_id)
        → "anchor" / "application" / "origin" / "outside"

This is read-only over the existing CitationNetwork — no rebuild, no
extra storage. The clusters are built on demand and cached on the
pipeline (see ``get_or_build_clusters``).
"""
from __future__ import annotations

import hashlib
import os
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Set, Tuple

from .citation_network import CitationNetwork


# ──────────────────────────────────────────────────────────────────────
# Public dataclass
# ──────────────────────────────────────────────────────────────────────

@dataclass
class DoctrineCluster:
    """A single doctrine cluster: one anchor plus its 1-hop neighbourhood.

    Roles within the cluster:
      • anchor       — the leading case the cluster is built around
      • applications — docs that cite the anchor (downstream propagation);
                       ``build_clusters`` may also append co-citation
                       neighbours to this list
      • origins      — docs the anchor cites (upstream foundation)

    A "boundary" role (docs that overlap with this cluster but are rooted
    in a neighbouring anchor) is a design concept only: this class has no
    field or method that stores or computes it.
    """

    cluster_id: str                      # stable hash of anchor_id
    anchor_id: str                       # doc_id of the leading case
    anchor_label: Optional[str] = None   # human-readable citation, if known
    applications: List[str] = field(default_factory=list)
    origins: List[str] = field(default_factory=list)
    n_in: int = 0                        # incoming citations to anchor
    n_out: int = 0                       # outgoing from anchor
    domain: Optional[str] = None         # anchor's domain, else majority vote
    statute_refs: List[str] = field(default_factory=list)  # most-cited statutes
    extra: Dict[str, Any] = field(default_factory=dict)

    @property
    def size(self) -> int:
        """Anchor + applications + origins.

        This is a plain sum of list lengths: a doc listed in both
        ``applications`` and ``origins`` is counted twice here, whereas
        ``members()`` de-duplicates.
        """
        return 1 + len(self.applications) + len(self.origins)

    def members(self) -> List[str]:
        """All distinct doc_ids in this cluster, anchor first."""
        # dict.fromkeys de-duplicates while keeping first-seen order.
        ordered = dict.fromkeys(
            [self.anchor_id, *self.applications, *self.origins]
        )
        return list(ordered)

    def role_of(self, doc_id: str) -> Optional[str]:
        """Return "anchor", "application", "origin", or None if not a member.

        Checks run in that order, so a doc present in both lists is
        reported as an application.
        """
        if doc_id == self.anchor_id:
            return "anchor"
        if doc_id in self.applications:
            return "application"
        if doc_id in self.origins:
            return "origin"
        return None


# ──────────────────────────────────────────────────────────────────────
# Build
# ──────────────────────────────────────────────────────────────────────

def _anchor_in_degree(cn: CitationNetwork) -> Dict[str, int]:
    """Compute in-degree (incoming citations) for every resolved target doc.

    A doc's in-degree is the number of distinct OTHER corpus docs that
    cite it via any citation string that resolves to this doc.

    Citers are pooled per target before counting, so a doc that reaches
    the same target through two different citation strings counts once,
    and a doc citing itself does not count. This keeps the value equal to
    ``len(_docs_citing(cn, target))``.

    Returns
    -------
    Dict[str, int]
        ``{target_doc_id: in_degree}`` in first-seen order of
        ``cn.doc_for_citation``. Targets with no other citers map to 0.
    """
    citers_by_target: Dict[str, Set[str]] = {}
    for citation, target_doc in cn.doc_for_citation.items():
        if not target_doc:
            continue
        pool = citers_by_target.setdefault(target_doc, set())
        pool.update(cn.cited_by.get(citation, []))

    deg: Dict[str, int] = {}
    for target_doc, pool in citers_by_target.items():
        pool.discard(target_doc)  # a self-citation is not influence
        deg[target_doc] = len(pool)
    return deg


def _docs_citing(cn: CitationNetwork, doc_id: str) -> List[str]:
    """Find every corpus doc that cites `doc_id` (via any of its resolved
    citations).

    Returns distinct citers in first-seen order, excluding `doc_id`
    itself. The original author describes this as mirroring
    ``citation_network.network_for_doc``'s cited_by computation; that
    function is not visible from this module.
    """
    citers: Dict[str, None] = {}
    for citation, target in cn.doc_for_citation.items():
        if target != doc_id:
            continue
        for citer in cn.cited_by.get(citation, []):
            if citer != doc_id:
                citers.setdefault(citer, None)
    return list(citers)


def _docs_cited_by(cn: CitationNetwork, doc_id: str) -> List[str]:
    """The corpus docs that `doc_id` cites — i.e., the resolved targets of
    its outgoing citations.

    Citations that do not resolve to a corpus doc are skipped, as is a
    citation that resolves back to `doc_id`. Order is first-seen.
    """
    targets: Dict[str, None] = {}
    for c in cn.cites.get(doc_id, []):
        target = cn.doc_for_citation.get(c["text"])
        if target and target != doc_id:
            targets.setdefault(target, None)
    return list(targets)


def _docs_co_citing(
    cn: CitationNetwork,
    doc_id: str,
    *,
    min_overlap: int = 2,
) -> List[str]:
    """Co-citation expansion — docs that share at least `min_overlap`
    distinct citation strings with `doc_id`.

    Useful for small corpora where the standard 'cite the anchor directly'
    graph is sparse but docs cluster around common external precedents.

    Example: 5 articles all cite both 'ע"א 6276/95 מגדלי הים' (a court
    ruling) and 'סעיף 39 לחוק החוזים' (section 39 of the Contracts Law).
    They form a co-citation cluster even if none of them cites the
    OTHERS directly.

    Each shared citation string counts once per neighbour even when
    ``cn.cited_by`` lists that neighbour several times for it. Results
    come back in a deterministic first-seen order.
    """
    # Ordered de-dup (not a set) so the result order is reproducible.
    my_cites = list(
        dict.fromkeys(c["text"] for c in cn.cites.get(doc_id, []))
    )
    if len(my_cites) < min_overlap:
        return []

    shared: Dict[str, int] = {}
    for cite in my_cites:
        # De-duplicate citers so one citation string contributes at most 1.
        for other in dict.fromkeys(cn.cited_by.get(cite, [])):
            if other != doc_id:
                shared[other] = shared.get(other, 0) + 1
    return [d for d, n in shared.items() if n >= min_overlap]


def _hash_id(s: str) -> str:
    """Stable 12-char hash for cluster IDs (truncated MD5 hex digest).

    Used as an identifier only, not for anything security-sensitive.
    """
    return hashlib.md5(s.encode("utf-8")).hexdigest()[:12]


def _index_docs_by_id(docs: Optional[List[Any]]) -> Dict[str, Any]:
    """Map ``doc.id`` → doc for every doc that has a truthy ``id``."""
    by_id: Dict[str, Any] = {}
    for d in docs or []:
        did = getattr(d, "id", None)
        if did:
            by_id[did] = d
    return by_id


def _majority_domain(
    doc_by_id: Dict[str, Any],
    member_ids: List[str],
) -> Optional[str]:
    """Most frequent ``metadata["domain"]`` among `member_ids`, or None.

    Members missing from `doc_by_id` or without a domain are ignored.
    Ties go to the domain seen first.
    """
    counts: Dict[str, int] = {}
    for member_id in member_ids:
        d = doc_by_id.get(member_id)
        if not d:
            continue
        dom = (getattr(d, "metadata", None) or {}).get("domain")
        if dom:
            counts[dom] = counts.get(dom, 0) + 1
    if not counts:
        return None
    return max(counts.items(), key=lambda kv: kv[1])[0]


def _statute_signature(
    cn: CitationNetwork,
    member_ids: List[str],
    limit: int = 5,
) -> List[str]:
    """The `limit` statute citation strings cited most often by `member_ids`.

    Only citations whose ``kind`` is ``"statute"`` are counted. Ties keep
    first-seen order (the sort is stable).
    """
    counts: Dict[str, int] = {}
    for member_id in member_ids:
        for c in cn.cites.get(member_id, []):
            if c.get("kind") == "statute":
                counts[c["text"]] = counts.get(c["text"], 0) + 1
    ranked = sorted(counts.items(), key=lambda kv: -kv[1])
    return [text for text, _ in ranked[:limit]]


def _virtual_anchor_clusters(
    cn: CitationNetwork,
    *,
    docs: Optional[List[Any]] = None,
    min_in_degree: int = 3,
    top_n: int = 500,
) -> List[DoctrineCluster]:
    """Fallback for corpora where citations point OUTSIDE the corpus
    (e.g. summary articles citing court rulings the corpus doesn't
    contain).

    Strategy: treat each popular citation string as a "virtual anchor".
    The cluster = all corpus docs that cite that string. No `origins` (we
    don't have the cited doc to walk its outgoing citations).

    Parameters
    ----------
    cn : CitationNetwork
        The pre-built citation index.
    docs : list of Document-like objects (optional)
        Used only for the majority-domain vote among citing docs.
    min_in_degree : int
        Minimum number of distinct citing docs for a citation string to
        become a virtual anchor.
    top_n : int
        Cap on the number of virtual anchors.

    Returns
    -------
    List[DoctrineCluster] sorted by number of distinct citers, highest
    first. Each has ``anchor_id = "virtual:<citation string>"`` and
    ``extra["virtual"] = True``.
    """
    doc_by_id = _index_docs_by_id(docs)

    # Rank citation strings by how many distinct corpus docs cite them.
    # Ordered de-dup (instead of set()) keeps member order reproducible
    # across runs regardless of string-hash randomization.
    ranked: List[Tuple[str, List[str]]] = []
    for cit, citers in cn.cited_by.items():
        unique_citers = list(dict.fromkeys(citers))
        if len(unique_citers) >= min_in_degree:
            ranked.append((cit, unique_citers))
    ranked.sort(key=lambda item: -len(item[1]))
    ranked = ranked[:top_n]

    clusters: List[DoctrineCluster] = []
    for cit_text, applications in ranked:
        if not applications:
            continue

        # Domain — majority vote among citing docs (needs `docs`).
        anchor_domain = (
            _majority_domain(doc_by_id, applications) if doc_by_id else None
        )

        # The anchor itself is virtual — use the citation string as ID.
        # Prefix with 'virtual:' so callers can detect it isn't a corpus doc.
        anchor_id = f"virtual:{cit_text}"
        clusters.append(DoctrineCluster(
            cluster_id=_hash_id(anchor_id),
            anchor_id=anchor_id,
            anchor_label=cit_text,          # the citation string itself
            applications=applications,
            origins=[],                     # no graph-walk for virtuals
            n_in=len(applications),
            n_out=0,
            domain=anchor_domain,
            # Most-cited statutes WITHIN this cluster (signature).
            statute_refs=_statute_signature(cn, applications),
            extra={
                "virtual": True,
                "kind": cn.citation_meta.get(cit_text, {}).get(
                    "kind", "caselaw"),
            },
        ))
    return clusters


def build_clusters(
    cn: CitationNetwork,
    *,
    docs: Optional[List[Any]] = None,    # for resolving anchor labels + domain
    min_in_degree: int = 3,              # minimum citations to qualify as anchor
    top_n: int = 500,                    # cap on number of anchors
    include_origins: bool = True,
    fallback_to_virtual: bool = True,    # if 0 in-corpus anchors, use virtuals
    augment_with_virtual: bool = False,  # produce virtual anchors ALONGSIDE in-corpus
    co_cite_expand: bool = True,         # expand sparse clusters via co-citation
    co_cite_min_overlap: int = 2,        # ≥N shared citations = co-cite neighbour
    co_cite_threshold: int = 3,          # only expand clusters below this many applications
) -> List[DoctrineCluster]:
    """Build doctrine clusters from a citation network.

    Parameters
    ----------
    cn : CitationNetwork
        The pre-built citation index.
    docs : list of Document-like objects (optional)
        Used to resolve anchor labels (metadata ``citation`` or ``title``)
        and the domain per cluster. If None, anchors carry no label.
    min_in_degree : int
        A doc must have at least this many incoming citations (distinct
        other citing docs) to qualify as an anchor. Default 3 — per the
        original author, the threshold below which clusters become noisy.
    top_n : int
        Hard cap on number of in-corpus anchors; also passed as the cap
        for virtual anchors. The original author considers 500 generous
        for the number of truly leading Israeli cases.
    include_origins : bool
        If True, the cluster also includes docs the anchor itself cites
        (the origins of the doctrine — useful for tracing roots).
    fallback_to_virtual : bool
        If no cluster was produced, fall back to virtual anchors (popular
        citation strings; see ``_virtual_anchor_clusters``).
    augment_with_virtual : bool
        Also emit virtual anchors alongside in-corpus ones, skipping any
        whose citation string resolves to an existing in-corpus anchor.
    co_cite_expand : bool
        Allow co-citation expansion of sparse clusters.
    co_cite_min_overlap : int
        Shared citation strings needed to count as a co-citation
        neighbour of the anchor.
    co_cite_threshold : int
        Only clusters with fewer direct applications than this are
        expanded.

    Returns
    -------
    List[DoctrineCluster] sorted by ``n_in``, highest first.
    """
    if cn.n_docs == 0:
        return []

    # Build doc lookup for anchor metadata
    doc_by_id = _index_docs_by_id(docs)

    # Find anchor candidates by in-degree
    deg = _anchor_in_degree(cn)
    anchors = sorted(
        ((did, n) for did, n in deg.items() if n >= min_in_degree),
        key=lambda x: -x[1],
    )[:top_n]

    clusters: List[DoctrineCluster] = []
    for anchor_id, in_deg in anchors:
        applications = _docs_citing(cn, anchor_id)
        origins = _docs_cited_by(cn, anchor_id) if include_origins else []

        # Co-citation expansion — only for sparse clusters where the direct
        # citation graph isn't enough. Heuristic: a doc sharing enough
        # outbound citations with the anchor is treated as arguing the
        # same doctrine even if it doesn't cite the anchor itself.
        co_cite_added: List[str] = []
        if co_cite_expand and len(applications) < co_cite_threshold:
            existing = {anchor_id, *applications, *origins}
            for cand in _docs_co_citing(
                cn, anchor_id, min_overlap=co_cite_min_overlap,
            ):
                if cand not in existing:
                    co_cite_added.append(cand)
                    existing.add(cand)
            if co_cite_added:
                applications = applications + co_cite_added

        # Anchor metadata
        anchor_label: Optional[str] = None
        anchor_domain: Optional[str] = None
        anchor_doc = doc_by_id.get(anchor_id)
        if anchor_doc is not None:
            md = getattr(anchor_doc, "metadata", None) or {}
            anchor_label = md.get("citation") or md.get("title") or anchor_id
            anchor_domain = md.get("domain")

        member_ids = [anchor_id, *applications, *origins]

        # Most-cited statutes within this cluster (signature of doctrine)
        statute_refs = _statute_signature(cn, member_ids)

        # Domain — majority vote among members, only when the anchor's own
        # metadata did not supply one. `or anchor_domain` keeps the prior
        # value when no member has a domain.
        if doc_by_id and not anchor_domain:
            anchor_domain = (
                _majority_domain(doc_by_id, member_ids) or anchor_domain
            )

        clusters.append(DoctrineCluster(
            cluster_id=_hash_id(anchor_id),
            anchor_id=anchor_id,
            anchor_label=anchor_label,
            applications=applications,
            origins=origins,
            n_in=in_deg,
            n_out=len(origins),
            domain=anchor_domain,
            statute_refs=statute_refs,
            extra=({"co_cite_added": co_cite_added}
                   if co_cite_added else {}),
        ))

    # Augmentation — emit virtual anchors for popular external citations
    # ALONGSIDE in-corpus clusters, then dedup against existing in-corpus
    # anchors and re-sort by influence. Off by default so the basic
    # "in-corpus only" semantics are preserved; get_or_build_clusters
    # opts in.
    if augment_with_virtual:
        existing_anchor_ids = {c.anchor_id for c in clusters}
        virtuals = _virtual_anchor_clusters(
            cn, docs=docs,
            min_in_degree=max(2, min_in_degree),
            top_n=top_n,
        )
        for vc in virtuals:
            cit_text = vc.anchor_id[len("virtual:"):]
            target = cn.doc_for_citation.get(cit_text)
            if target and target in existing_anchor_ids:
                # Already represented by the in-corpus anchor — skip
                continue
            clusters.append(vc)
        clusters.sort(key=lambda c: -c.n_in)

    # Fallback for summary-style corpora where citations are mostly
    # OUTSIDE the corpus. If we found nothing, treat popular citation
    # strings themselves as virtual anchors. Skipped when augmentation
    # already ran: it used the same arguments, so an empty result there
    # means this pass would be empty too.
    if not clusters and fallback_to_virtual and not augment_with_virtual:
        clusters = _virtual_anchor_clusters(
            cn, docs=docs,
            min_in_degree=max(2, min_in_degree),  # be slightly stricter
            top_n=top_n,
        )

    return clusters


# ──────────────────────────────────────────────────────────────────────
# Query → cluster routing
# ──────────────────────────────────────────────────────────────────────

def cluster_for_query(
    clusters: List[DoctrineCluster],
    retrieved_doc_ids: List[str],
    *,
    top_k: int = 3,
    query_domain: Optional[str] = None,
    domain_match_boost: float = 1.5,
    domain_miss_penalty: float = 0.7,
    query_text: Optional[str] = None,
    # Boost ≠ 1.0 enables a query-keyword vs cluster-anchor-label match
    # boost. Default OFF (1.0) — per the original author, an experiment
    # with 1.3 hurt more than it helped on Hebrew legal queries because
    # common words (זכויות, חולה, הפליה — "rights", "patient",
    # "discrimination") appear in many anchor_labels and amplify noise
    # more than signal. The parameter is kept so callers can opt in once
    # a smarter token filter exists.
    keyword_boost: float = 1.0,
) -> List[Tuple[DoctrineCluster, float]]:
    """Given a list of doc_ids returned by retrieval (sorted by relevance),
    rank clusters by how densely they cover the retrieved set.

        score = Σ rank_weight(doc) for retrieved docs in cluster
                / cluster.size ** 0.5

    where ``rank_weight = 1 / (1 + 0.2 * position)``. The √denominator
    favors compact clusters that contain many of the top hits; without
    it a very large cluster would tend to win on overlap alone.

    A doc id repeated in `retrieved_doc_ids` is weighted by its FIRST
    (best) position. Clusters with no retrieved member are dropped.

    Domain-aware ranking
    ────────────────────
    When ``query_domain`` is provided, each cluster's score is
    multiplied by:

      • ``domain_match_boost`` when ``cluster.domain == query_domain``
      • ``domain_miss_penalty`` when both are set but DIFFER
      • 1.0 when either side has no domain set (unlabeled clusters are
        not penalized)

    Lexical-hint boost
    ──────────────────
    Disabled by default (``keyword_boost == 1.0``). When ``query_text``
    is provided and ``keyword_boost`` is not 1.0, query tokens of length
    ≥ 4 are matched as substrings against ``cluster.anchor_label`` and
    ``cluster.statute_refs``; any hit multiplies the score by
    ``keyword_boost`` once. Intended use: a query such as "סיכול חוזה"
    (frustration of contract) should prefer the cluster anchored on a
    frustration ruling over a generic contracts cluster. Shorter tokens
    are skipped.

    Returns up to `top_k` (cluster, score) tuples, score descending; an
    empty list when `top_k` is zero or negative.
    """
    if not clusters or not retrieved_doc_ids:
        return []

    # Boost weights for higher-ranked retrievals. setdefault keeps the
    # first position for duplicates, so a repeat cannot demote a top hit.
    rank_weight: Dict[str, float] = {}
    for i, did in enumerate(retrieved_doc_ids):
        rank_weight.setdefault(did, 1.0 / (1 + i * 0.2))

    # Pre-tokenize query keywords once. strip() only trims the ends of a
    # token, so quote marks inside a word (e.g. ע"א) are kept. The last
    # character of the strip set is a combining accent mark carried over
    # unchanged from the original literal.
    query_keywords: List[str] = []
    if query_text:
        for tok in query_text.split():
            cleaned = tok.strip(".,;:?!()[]{}́").lower()
            if len(cleaned) >= 4:
                query_keywords.append(cleaned)
    use_keywords = bool(query_keywords) and keyword_boost != 1.0

    scored: List[Tuple[DoctrineCluster, float]] = []
    for c in clusters:
        members = set(c.members())
        # Walk hits in retrieval order so the float sum is reproducible.
        hits = [w for did, w in rank_weight.items() if did in members]
        if not hits:
            continue

        # Weighted overlap (top hits matter more than tail), penalized by
        # cluster size so a big cluster doesn't dominate by being large.
        score = sum(hits) / (max(c.size, 1) ** 0.5)

        # Domain-aware multiplier — only when both sides have a domain
        if query_domain and c.domain:
            if c.domain == query_domain:
                score *= float(domain_match_boost)
            else:
                score *= float(domain_miss_penalty)

        # Lexical-hint boost: query keyword appears in the cluster's
        # anchor label or statute references.
        if use_keywords:
            haystack = (
                (c.anchor_label or "") + " " +
                " ".join(c.statute_refs or [])
            ).lower()
            if any(kw in haystack for kw in query_keywords):
                score *= float(keyword_boost)

        scored.append((c, score))

    scored.sort(key=lambda t: -t[1])
    return scored[:max(top_k, 0)]


def classify_role(cluster: DoctrineCluster, doc_id: str) -> str:
    """Quick lookup — what role does doc_id play in this cluster?

    Returns 'anchor', 'application', 'origin', or 'outside'.
    """
    return cluster.role_of(doc_id) or "outside"


# ──────────────────────────────────────────────────────────────────────
# Pipeline integration
# ──────────────────────────────────────────────────────────────────────

def get_or_build_clusters(
    pipe,
    *,
    force_rebuild: bool = False,
    min_in_degree: Optional[int] = None,
    top_n: int = 500,
) -> List[DoctrineCluster]:
    """Cached cluster build, stored on ``pipe._doctrine_clusters_cache``.

    The cache is bypassed only by ``force_rebuild=True`` (e.g. after a
    new corpus ingest). Note that a cached result is returned as-is even
    if `min_in_degree` or `top_n` differ from the call that built it.

    `min_in_degree` is auto-tuned when left as None, based on the number
    of docs found on the pipeline: 3 for 10K+ docs, 2 for 1K-10K, and 1
    below 1K. An explicit value overrides this.

    Virtual-anchor augmentation is on unless the environment variable
    ``TAU_RAG_CLUSTER_AUGMENT`` is set to ``"0"``.

    Returns an empty list when `pipe` is None.
    """
    if pipe is None:
        return []

    # Empty list is a valid cached state (no clusters meet threshold).
    # `is not None` so we don't re-run an expensive build over and over.
    cached = getattr(pipe, "_doctrine_clusters_cache", None)
    if not force_rebuild and cached is not None:
        return cached

    # Imported lazily, as in the original code.
    from .citation_network import get_or_build as _gob_cn
    cn = _gob_cn(pipe, force_rebuild=force_rebuild)
    docs = (getattr(pipe, "_indexed_docs", None)
            or getattr(pipe, "_docs", None)
            or [])

    # Adaptive threshold for small corpora
    if min_in_degree is None:
        n_docs = len(docs)
        if n_docs >= 10_000:
            min_in_degree = 3
        elif n_docs >= 1_000:
            min_in_degree = 2
        else:
            min_in_degree = 1  # tiny corpus — even 1 incoming citation matters

    # Augment with virtual anchors by default. The virtual pass loops over
    # citations × applications × cites-per-application, which the original
    # author reports as costly at boot on large corpora — set
    # TAU_RAG_CLUSTER_AUGMENT=0 to opt out.
    augment = os.environ.get("TAU_RAG_CLUSTER_AUGMENT", "1") != "0"

    clusters = build_clusters(
        cn, docs=docs, min_in_degree=min_in_degree, top_n=top_n,
        augment_with_virtual=augment,
    )
    pipe._doctrine_clusters_cache = clusters
    # New corpus / new clusters → invalidate the meta map cache so the
    # next member-resolution call rebuilds with the fresh docs.
    _invalidate_meta_cache()
    return clusters


# ──────────────────────────────────────────────────────────────────────
# Inspection / debug
# ──────────────────────────────────────────────────────────────────────

# Single-slot memoization for the {doc_id: metadata} map. Cluster detail
# pages can hit resolve_cluster_members repeatedly with the SAME `docs`
# list; without this, every open is O(N) on the full corpus.
# The single-slot `_meta_cache` below memoizes the {doc_id: metadata} map.
# Its cache key is the docs list itself (by identity) plus its current
# length, so a replaced list or a list that grew/shrunk forces a rebuild.
# We use identity (`is`) rather than id() because Python may recycle id()
# after garbage collection. For in-place mutation of an existing doc's
# metadata, callers must explicitly invalidate via _invalidate_meta_cache().
_meta_cache: Dict[str, Any] = {"docs_ref": None, "len": None, "value": None}


def _invalidate_meta_cache() -> None:
    """Force the next _build_meta_by_id call to rebuild from scratch."""
    for slot in ("docs_ref", "len", "value"):
        _meta_cache[slot] = None


def _build_meta_by_id(docs: Optional[List[Any]]) -> Dict[str, Dict[str, Any]]:
    """Memoized {doc_id: metadata} map.

    Returns the SAME dict instance on consecutive calls with the same
    `docs` list (identity match + unchanged length). Mutations to the
    returned dict persist across calls — by design, so the API layer can
    layer extra fields on once per corpus load.

    Each entry has the keys ``id``, ``title`` (metadata ``title``, else
    ``citation``, else the id), ``domain``, and ``kind`` (metadata
    ``case_type``, else ``kind``). Docs without a truthy ``id`` are
    skipped. ``docs=None`` yields a fresh empty dict and leaves the cache
    untouched.
    """
    if docs is None:
        return {}

    cached = _meta_cache["value"]
    same_list = _meta_cache["docs_ref"] is docs
    same_len = _meta_cache["len"] == len(docs)
    if cached is not None and same_list and same_len:
        return cached

    meta_by_id: Dict[str, Dict[str, Any]] = {}
    for d in docs:
        did = getattr(d, "id", None)
        if not did:
            continue
        md = getattr(d, "metadata", None) or {}
        meta_by_id[did] = {
            "id": did,
            "title": md.get("title") or md.get("citation") or did,
            "domain": md.get("domain"),
            "kind": md.get("case_type") or md.get("kind"),
        }

    _meta_cache["docs_ref"] = docs
    _meta_cache["len"] = len(docs)
    _meta_cache["value"] = meta_by_id
    return meta_by_id


def resolve_cluster_members(
    docs: Optional[List[Any]],
    member_ids: List[str],
    *,
    co_cite_added: Optional[List[str]] = None,
) -> List[Dict[str, Any]]:
    """Resolve a list of cluster member doc IDs to friendly metadata rows:
    ``{id, title, domain, kind, is_co_cite}``.

    Intended for cluster-detail endpoints, so a frontend can render
    titles instead of opaque internal identifiers.

    Parameters
    ----------
    docs
        Documents the pipeline has indexed (anything with ``.id`` and
        ``.metadata``). May be None / empty — every member is then
        returned as a bare ID with ``title=None``.
    member_ids
        The ordered list of doc IDs to resolve (order is preserved).
    co_cite_added
        Optional IDs that were pulled in via co-citation expansion
        (rather than direct citation). Each row is tagged with
        ``is_co_cite: True/False`` so the UI can render a provenance
        badge.

    Returns
    -------
    One new dict per member id. IDs unknown to `docs` (e.g. virtual
    anchor IDs) get ``title``, ``domain`` and ``kind`` set to None.
    """
    co_cite_set: Set[str] = set(co_cite_added or [])
    meta_by_id = _build_meta_by_id(docs)

    rows: List[Dict[str, Any]] = []
    for did in member_ids:
        known = meta_by_id.get(did)
        if known is None:
            # Unknown to the pipeline — virtual anchor IDs or anything
            # that hasn't been indexed yet.
            known = {"id": did, "title": None, "domain": None, "kind": None}
        # Copy into a new dict so the memoized entry is never mutated here.
        rows.append({**known, "is_co_cite": did in co_cite_set})
    return rows


def find_clusters_for_doc(
    clusters: List[DoctrineCluster],
    doc_id: str,
) -> List[Tuple[DoctrineCluster, str]]:
    """Inverse routing — given a document, return every cluster it belongs
    to and the role it plays in each.

    A doc can be in multiple doctrines simultaneously: it might be the
    anchor of one cluster and an application of another. This helper
    makes that membership explicit.

    Returns ``[(cluster, role), ...]`` where ``role`` is one of
    ``"anchor"``, ``"application"``, ``"origin"``. Order: anchors first,
    then applications, then origins, with each tier sorted by the
    cluster's ``n_in`` (highest first). Returns an empty list when either
    argument is empty.
    """
    if not clusters or not doc_id:
        return []

    role_rank = {"anchor": 0, "application": 1, "origin": 2}
    rows: List[Tuple[DoctrineCluster, str, int]] = []
    for c in clusters:
        role = c.role_of(doc_id)
        if role:
            rows.append((c, role, role_rank.get(role, 3)))

    # Sort: role-tier ascending, then anchor in-degree descending
    rows.sort(key=lambda t: (t[2], -t[0].n_in))
    return [(c, role) for c, role, _ in rows]


def cluster_summary(c: DoctrineCluster) -> Dict[str, Any]:
    """JSON-friendly snapshot of a cluster for API responses.

    List-valued fields (``statute_refs``, ``co_cite_added``) are copies,
    so mutating the returned dict never touches the (possibly cached)
    cluster object.
    """
    extra = c.extra or {}
    co_cite_added = extra.get("co_cite_added") or []
    return {
        "cluster_id": c.cluster_id,
        "anchor_id": c.anchor_id,
        "anchor_label": c.anchor_label,
        "size": c.size,
        "n_applications": len(c.applications),
        "n_origins": len(c.origins),
        "n_in": c.n_in,
        "n_out": c.n_out,
        "domain": c.domain,
        # Copied for the same reason as co_cite_added below: a snapshot
        # must not alias the cluster's live list.
        "statute_refs": list(c.statute_refs or []),
        # Provenance — which members were added via co-citation expansion
        # vs direct citation. UI uses this to render a "co-cited" badge.
        "co_cite_added": list(co_cite_added),
        "n_co_cite_added": len(co_cite_added),
        # Mark virtual-anchor clusters explicitly so the UI can render
        # them with a different style (citation string is the anchor,
        # not a corpus doc).
        "is_virtual": bool(extra.get("virtual")),
    }


__all__ = [
    "DoctrineCluster",
    "build_clusters",
    "cluster_for_query",
    "classify_role",
    "get_or_build_clusters",
    "cluster_summary",
    "resolve_cluster_members",
    "find_clusters_for_doc",
]