Spaces:

MataStrategy
/

ground-zero

Sleeping

jefffffff9 Claude Opus 4.7 committed on Apr 28

Commit

5815492

1 Parent(s): 757e833

Phrasebook: multi-alias + French source keys, plus misses-script stub

Two-track vocabulary enrichment that keeps the matcher and UI unchanged:

1. Multi-alias entries. Each curated row may now carry any combination of
`source`, `sources` (list), `source_fr`, `sources_fr` (list). The loader
flattens them into one match-candidate per alias, all pointing at the
same target translation. A typed paraphrase ("morning, sir") or French
equivalent ("bonjour") hits the same Bambara/Pular target as the
canonical English source. Existing single-`source` rows keep working
unchanged (verified: 110 rows → 110 candidates, exact match preserved).

2. top_k now dedupes by canonical target so multi-alias rows can't crowd
out the RAG few-shot slots.

3. scripts/phrasebook_misses.py — stub for the data-driven curation loop.
Reads data/field_turns.jsonl, groups translate-phase rows where
phrasebook=null, prints the most frequent unmet inputs. Filters by
--lang and --since. --draft (LLM-proposed target per miss) is a
marked TODO — wire it when we want a tighter loop.

The actual curation work (adding French keys / aliases to existing JSON
entries) is now data, not code.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

Files changed (2) hide show

scripts/phrasebook_misses.py +125 -0
src/llm/phrasebook.py +66 -8

scripts/phrasebook_misses.py ADDED Viewed

	@@ -0,0 +1,125 @@

+"""Surface frequent phrasebook misses from field telemetry.
+Reads `data/field_turns.jsonl` (or $FIELD_TURNS_PATH) and groups inputs
+that produced `phrasebook=null` so you can decide which ones deserve a
+curated entry. This is the data-driven side of vocabulary growth: instead
+of guessing what to translate next, look at what real users typed.
+Usage:
+    python scripts/phrasebook_misses.py                 # top-20 misses, all langs
+    python scripts/phrasebook_misses.py --lang ful      # filter to one target lang
+    python scripts/phrasebook_misses.py --top 50        # show more
+    python scripts/phrasebook_misses.py --since 2026-04 # filter by ISO-prefix on ts
+    python scripts/phrasebook_misses.py --draft         # also draft an LLM target
+                                                        #   (HF_TOKEN required)
+Output is plain text — eyeball the list, decide what's worth adding, and
+paste curated translations into:
+    configs/dialect_anchors/{bambara,pular}_phrasebook.json
+Stub status: counting + grouping is fully wired. The --draft flag is a
+TODO — it'll call MinimalClient on each miss and propose a target string
+for you to review. Land that when we want a tighter loop.
+"""
+from __future__ import annotations
+import argparse
+import json
+import os
+import sys
+from collections import Counter
+from pathlib import Path
+from typing import Optional
+# Default location matches TurnLogger's default.
+_REPO_ROOT = Path(__file__).resolve().parent.parent
+_DEFAULT_LOG = _REPO_ROOT / "data" / "field_turns.jsonl"
+def _norm(text: str) -> str:
+    """Cheap normalisation for grouping. Same intent as phrasebook._normalize
+    but kept local so this script has no project import dependency."""
+    return " ".join((text or "").lower().strip().split())
def load_misses(
    path: Path,
    lang: Optional[str] = None,
    since: Optional[str] = None,
) -> Counter:
    """Return Counter of normalised user inputs that produced phrasebook=null.

    The file is read as JSON Lines. Blank lines, lines that are not valid
    JSON, and lines whose JSON value is not an object are skipped, so a single
    corrupt record cannot abort the whole report.

    Args:
        path: Location of the telemetry file. When it does not exist a hint is
            printed to stderr and an empty Counter is returned.
        lang: Only count rows whose `output_lang` equals this (e.g. "ful").
        since: Only count rows whose `ts` starts with this string (ISO
            prefix). Rows whose `ts` is missing or not a string never match.

    Returns:
        Counter mapping each normalised input text to the number of
        translate-phase rows in which it missed the phrasebook.
    """
    if not path.exists():
        print(f"No telemetry file at {path} — run the app to produce some turns first.",
              file=sys.stderr)
        return Counter()
    counts: Counter = Counter()
    with path.open("r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                continue
            # json.loads also returns lists, strings, numbers and None for
            # valid JSON; only objects carry the fields inspected below.
            if not isinstance(row, dict):
                continue
            # Only translate-phase rows are useful — reply-phase rows always
            # have phrasebook=null by construction. Rows without a `phase`
            # key are kept.
            if row.get("phase") not in (None, "translate"):
                continue
            if row.get("phrasebook") is not None:
                continue
            if lang and row.get("output_lang") != lang:
                continue
            if since:
                ts = row.get("ts")
                # A numeric or missing timestamp cannot match an ISO prefix.
                if not isinstance(ts, str) or not ts.startswith(since):
                    continue
            text = row.get("user_text") or row.get("transcript") or ""
            # A non-string payload is not something a user typed; skip it
            # rather than let normalisation fail on it.
            if not isinstance(text, str):
                continue
            text = _norm(text)
            if text and not text.startswith("("):  # skip our own sentinels
                counts[text] += 1
    return counts
def main(argv: Optional[list[str]] = None) -> None:
    """Command-line entry point: print the most frequent phrasebook misses.

    Parses the options, loads the miss counts via ``load_misses`` and prints a
    summary line followed by a count/input table of the top entries. When no
    misses match the filters a single notice is printed instead.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) keeps the normal command-line behaviour; passing a
            list lets the entry point be driven from tests or other scripts.
    """
    # __doc__ is None when Python runs with -OO (docstrings stripped), so fall
    # back to a fixed summary instead of failing before argument parsing.
    summary = (__doc__ or "Surface frequent phrasebook misses from field telemetry.").splitlines()[0]
    ap = argparse.ArgumentParser(description=summary)
    ap.add_argument("--path", default=os.environ.get("FIELD_TURNS_PATH", str(_DEFAULT_LOG)),
                    help="Path to field_turns.jsonl (default: data/field_turns.jsonl)")
    ap.add_argument("--lang", choices=["bam", "ful", "fr", "en"], default=None,
                    help="Filter to one output language")
    ap.add_argument("--since", default=None,
                    help='ISO-prefix filter on ts, e.g. "2026-04" or "2026-04-25"')
    ap.add_argument("--top", type=int, default=20, help="How many misses to print")
    ap.add_argument("--draft", action="store_true",
                    help="(TODO) Also draft an LLM-proposed target for each miss")
    args = ap.parse_args(argv)
    counts = load_misses(Path(args.path), lang=args.lang, since=args.since)
    if not counts:
        print("No misses found with the current filters.")
        return
    total = sum(counts.values())
    distinct = len(counts)
    print(f"{total} miss-events across {distinct} distinct inputs"
          + (f" (lang={args.lang})" if args.lang else "")
          + (f" since {args.since}" if args.since else "")
          + ".\n")
    print(f"{'count':>5}  input")
    print(f"{'-----':>5}  -----")
    for text, n in counts.most_common(args.top):
        print(f"{n:>5}  {text}")
    if args.draft:
        # TODO: import MinimalClient and call .chat(text, target_lang=args.lang)
        # for each top input, printing the proposed target alongside. Skipping
        # now to keep the stub dependency-free.
        print("\n[--draft is a stub — not yet implemented]")
+if __name__ == "__main__":
+    main()

src/llm/phrasebook.py CHANGED Viewed

@@ -11,9 +11,11 @@ Purpose
 Scope
     - Only fires when target language is bam or ful. For en/fr output we let
       the LLM (or a passthrough) handle it — nothing to short-circuit.
-    - Only English source keys (what the curated sheets contain). French or
-      in-language inputs will not match and will fall through to the LLM —
-      that's correct behaviour.
 Matching
     - Exact match on normalised string → score 1.0 ("exact").
@@ -58,8 +60,38 @@ def _normalize(text: str) -> str:
     return text.strip()
 @lru_cache(maxsize=4)
 def _load_phrasebook(lang: str) -> list[dict]:
     fname = _PHRASEBOOK_FILE.get(lang)
     if not fname:
         return []
@@ -70,10 +102,29 @@ def _load_phrasebook(lang: str) -> list[dict]:
     with path.open("r", encoding="utf-8") as f:
         data = json.load(f)
     pairs = data.get("pairs", [])
-    # Precompute normalised source for speed.
     for p in pairs:
-        p["_norm"] = _normalize(p.get("source", ""))
-    return pairs
 def lookup(
@@ -145,11 +196,18 @@ def top_k(user_text: str, target_lang: str, k: int = 3) -> list[dict]:
         scored.append((score, p))
     scored.sort(key=lambda x: x[0], reverse=True)
     out: list[dict] = []
-    for score, p in scored[:k]:
         out.append({
             "source":   p.get("source"),
-            "target":   p.get("target"),
             "category": p.get("category"),
             "score":    round(score, 3),
         })
     return out

 Scope
     - Only fires when target language is bam or ful. For en/fr output we let
       the LLM (or a passthrough) handle it — nothing to short-circuit.
+    - Source keys can be English and/or French, single or multi-alias. Each
+      curated row may carry any combination of `source`, `sources` (list),
+      `source_fr`, `sources_fr` (list); the loader flattens them into one
+      match-candidate per alias so a typed paraphrase or a French equivalent
+      hits the same target translation.
 Matching
     - Exact match on normalised string → score 1.0 ("exact").
     return text.strip()
+def _expand_aliases(entry: dict) -> list[str]:
+    """Collect every source-alias on an entry across both languages.
+    Schema (all fields optional, additive — existing single-`source` rows
+    keep working unchanged):
+        source      : "good morning"           # canonical English
+        sources     : ["morning", "morning!"]  # English aliases / paraphrases
+        source_fr   : "bonjour"                # canonical French
+        sources_fr  : ["salut", "bonjour à tous"]
+    """
+    out: list[str] = []
+    for key in ("source", "source_fr"):
+        v = entry.get(key)
+        if isinstance(v, str) and v.strip():
+            out.append(v)
+    for key in ("sources", "sources_fr"):
+        vs = entry.get(key)
+        if isinstance(vs, list):
+            out.extend(x for x in vs if isinstance(x, str) and x.strip())
+    return out
 @lru_cache(maxsize=4)
 def _load_phrasebook(lang: str) -> list[dict]:
+    """Load and flatten a phrasebook into one match-candidate per alias.
+    Each candidate carries the canonical source/target/category for display
+    and a precomputed normalised alias (`_norm`) for the matcher to compare
+    against. One curated row with N aliases produces N candidates that all
+    point at the same target translation — the matcher picks the closest
+    alias and returns the canonical entry.
+    """
     fname = _PHRASEBOOK_FILE.get(lang)
     if not fname:
         return []
     with path.open("r", encoding="utf-8") as f:
         data = json.load(f)
     pairs = data.get("pairs", [])
+    candidates: list[dict] = []
     for p in pairs:
+        target = p.get("target", "")
+        category = p.get("category")
+        # Canonical source for display: prefer English `source`, else first
+        # English `sources`, else French canonical, else first French alias.
+        canonical = (
+            p.get("source")
+            or (p.get("sources") or [None])[0]
+            or p.get("source_fr")
+            or (p.get("sources_fr") or [None])[0]
+            or ""
+        )
+        for alias in _expand_aliases(p):
+            candidates.append({
+                "source":   canonical or alias,
+                "target":   target,
+                "category": category,
+                "_alias":   alias,
+                "_norm":    _normalize(alias),
+            })
+    return candidates
 def lookup(
         scored.append((score, p))
     scored.sort(key=lambda x: x[0], reverse=True)
     out: list[dict] = []
+    seen: set[str] = set()  # dedupe: aliases of the same row → one slot
+    for score, p in scored:
+        target = p.get("target") or ""
+        if target in seen:
+            continue
+        seen.add(target)
         out.append({
             "source":   p.get("source"),
+            "target":   target,
             "category": p.get("category"),
             "score":    round(score, 3),
         })
+        if len(out) >= k:
+            break
     return out