Spaces:
Sleeping
Phrasebook: multi-alias + French source keys, plus misses-script stub
Browse files
Two-track vocabulary enrichment that keeps the matcher and UI unchanged:
1. Multi-alias entries. Each curated row may now carry any combination of
`source`, `sources` (list), `source_fr`, `sources_fr` (list). The loader
flattens them into one match-candidate per alias, all pointing at the
same target translation. A typed paraphrase ("morning, sir") or French
equivalent ("bonjour") hits the same Bambara/Pular target as the
canonical English source. Existing single-`source` rows keep working
unchanged (verified: 110 rows → 110 candidates, exact match preserved).
2. top_k now dedupes by canonical target so multi-alias rows can't crowd
out the RAG few-shot slots.
3. scripts/phrasebook_misses.py — stub for the data-driven curation loop.
Reads data/field_turns.jsonl, groups translate-phase rows where
phrasebook=null, prints the most frequent unmet inputs. Filters by
--lang and --since. --draft (LLM-proposed target per miss) is a
marked TODO — wire it when we want a tighter loop.
The actual curation work (adding French keys / aliases to existing JSON
entries) is now data, not code.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
- scripts/phrasebook_misses.py +125 -0
- src/llm/phrasebook.py +66 -8
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Surface frequent phrasebook misses from field telemetry.
|
| 2 |
+
|
| 3 |
+
Reads `data/field_turns.jsonl` (or $FIELD_TURNS_PATH) and groups inputs
|
| 4 |
+
that produced `phrasebook=null` so you can decide which ones deserve a
|
| 5 |
+
curated entry. This is the data-driven side of vocabulary growth: instead
|
| 6 |
+
of guessing what to translate next, look at what real users typed.
|
| 7 |
+
|
| 8 |
+
Usage:
|
| 9 |
+
python scripts/phrasebook_misses.py # top-20 misses, all langs
|
| 10 |
+
python scripts/phrasebook_misses.py --lang ful # filter to one target lang
|
| 11 |
+
python scripts/phrasebook_misses.py --top 50 # show more
|
| 12 |
+
python scripts/phrasebook_misses.py --since 2026-04 # filter by ISO-prefix on ts
|
| 13 |
+
python scripts/phrasebook_misses.py --draft # also draft an LLM target
|
| 14 |
+
# (HF_TOKEN required)
|
| 15 |
+
|
| 16 |
+
Output is plain text — eyeball the list, decide what's worth adding, and
|
| 17 |
+
paste curated translations into:
|
| 18 |
+
configs/dialect_anchors/{bambara,pular}_phrasebook.json
|
| 19 |
+
|
| 20 |
+
Stub status: counting + grouping is fully wired. The --draft flag is a
|
| 21 |
+
TODO — it'll call MinimalClient on each miss and propose a target string
|
| 22 |
+
for you to review. Land that when we want a tighter loop.
|
| 23 |
+
"""
|
| 24 |
+
from __future__ import annotations
|
| 25 |
+
|
| 26 |
+
import argparse
|
| 27 |
+
import json
|
| 28 |
+
import os
|
| 29 |
+
import sys
|
| 30 |
+
from collections import Counter
|
| 31 |
+
from pathlib import Path
|
| 32 |
+
from typing import Optional
|
| 33 |
+
|
| 34 |
+
# Default location matches TurnLogger's default.
|
| 35 |
+
_REPO_ROOT = Path(__file__).resolve().parent.parent
|
| 36 |
+
_DEFAULT_LOG = _REPO_ROOT / "data" / "field_turns.jsonl"
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def _norm(text: str) -> str:
|
| 40 |
+
"""Cheap normalisation for grouping. Same intent as phrasebook._normalize
|
| 41 |
+
but kept local so this script has no project import dependency."""
|
| 42 |
+
return " ".join((text or "").lower().strip().split())
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def load_misses(
    path: Path,
    lang: Optional[str] = None,
    since: Optional[str] = None,
) -> Counter:
    """Return a Counter of normalised user inputs that produced phrasebook=null.

    Reads ``path`` as JSON Lines, one telemetry row per line. Blank lines,
    lines that are not valid JSON, and rows that are not JSON objects are
    skipped, so a single corrupt record cannot abort the whole report.

    Args:
        path: Location of the telemetry JSONL file.
        lang: If given, only count rows whose ``output_lang`` equals this
            (e.g. "ful").
        since: If given, only count rows whose ``ts`` is a string that
            starts with this value (ISO prefix, e.g. "2026-04").

    Returns:
        Counter mapping each normalised input to the number of miss rows it
        appeared in. Empty when ``path`` is not an existing regular file
        (a notice is printed to stderr) or when nothing passes the filters.
    """
    # is_file() rather than exists(): a directory "exists" but cannot be
    # opened for reading, and deserves the same friendly notice.
    if not path.is_file():
        print(f"No telemetry file at {path} — run the app to produce some turns first.",
              file=sys.stderr)
        return Counter()

    counts: Counter = Counter()
    with path.open("r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Valid JSON that is not an object (null, list, number, string)
            # has no fields to inspect; treat it like a malformed line.
            if not isinstance(row, dict):
                continue
            # Only translate-phase rows are useful — reply-phase rows always
            # have phrasebook=null by construction. Rows with no `phase`
            # value are kept as well.
            if row.get("phase") not in (None, "translate"):
                continue
            if row.get("phrasebook") is not None:
                continue
            if lang and row.get("output_lang") != lang:
                continue
            if since:
                ts = row.get("ts")
                # A missing or non-string timestamp cannot match a prefix.
                if not (isinstance(ts, str) and ts.startswith(since)):
                    continue
            text = row.get("user_text") or row.get("transcript") or ""
            if not isinstance(text, str):
                continue
            text = _norm(text)
            if text and not text.startswith("("):  # skip our own sentinels
                counts[text] += 1
    return counts
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def main(argv: Optional[list[str]] = None) -> None:
    """Parse CLI flags, load the miss counts, and print a plain-text report.

    Prints a one-line summary, then a ``count / input`` table of the most
    frequent misses, limited by ``--top``. When nothing matches the
    filters a single notice is printed instead.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. Leave as
            ``None`` for normal command-line use.
    """
    # __doc__ is None when docstrings are stripped (python -OO); fall back
    # to no description instead of crashing on None.splitlines().
    doc_lines = (__doc__ or "").strip().splitlines()
    ap = argparse.ArgumentParser(description=doc_lines[0] if doc_lines else None)
    # `or` (not a .get default) so an empty FIELD_TURNS_PATH counts as unset.
    ap.add_argument("--path", default=os.environ.get("FIELD_TURNS_PATH") or str(_DEFAULT_LOG),
                    help="Path to field_turns.jsonl (default: data/field_turns.jsonl)")
    ap.add_argument("--lang", choices=["bam", "ful", "fr", "en"], default=None,
                    help="Filter to one output language")
    ap.add_argument("--since", default=None,
                    help='ISO-prefix filter on ts, e.g. "2026-04" or "2026-04-25"')
    ap.add_argument("--top", type=int, default=20, help="How many misses to print")
    ap.add_argument("--draft", action="store_true",
                    help="(TODO) Also draft an LLM-proposed target for each miss")
    args = ap.parse_args(argv)

    counts = load_misses(Path(args.path), lang=args.lang, since=args.since)
    if not counts:
        print("No misses found with the current filters.")
        return

    total = sum(counts.values())
    distinct = len(counts)
    print(f"{total} miss-events across {distinct} distinct inputs"
          + (f" (lang={args.lang})" if args.lang else "")
          + (f" since {args.since}" if args.since else "")
          + ".\n")
    print(f"{'count':>5} input")
    print(f"{'-----':>5} -----")
    for text, n in counts.most_common(args.top):
        print(f"{n:>5} {text}")

    if args.draft:
        # TODO: import MinimalClient and call .chat(text, target_lang=args.lang)
        # for each top input, printing the proposed target alongside. Skipping
        # now to keep the stub dependency-free.
        print("\n[--draft is a stub — not yet implemented]")


if __name__ == "__main__":
    main()
|
|
@@ -11,9 +11,11 @@ Purpose
|
|
| 11 |
Scope
|
| 12 |
- Only fires when target language is bam or ful. For en/fr output we let
|
| 13 |
the LLM (or a passthrough) handle it — nothing to short-circuit.
|
| 14 |
-
-
|
| 15 |
-
|
| 16 |
-
|
|
|
|
|
|
|
| 17 |
|
| 18 |
Matching
|
| 19 |
- Exact match on normalised string → score 1.0 ("exact").
|
|
@@ -58,8 +60,38 @@ def _normalize(text: str) -> str:
|
|
| 58 |
return text.strip()
|
| 59 |
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
@lru_cache(maxsize=4)
|
| 62 |
def _load_phrasebook(lang: str) -> list[dict]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
fname = _PHRASEBOOK_FILE.get(lang)
|
| 64 |
if not fname:
|
| 65 |
return []
|
|
@@ -70,10 +102,29 @@ def _load_phrasebook(lang: str) -> list[dict]:
|
|
| 70 |
with path.open("r", encoding="utf-8") as f:
|
| 71 |
data = json.load(f)
|
| 72 |
pairs = data.get("pairs", [])
|
| 73 |
-
|
|
|
|
| 74 |
for p in pairs:
|
| 75 |
-
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
|
| 79 |
def lookup(
|
|
@@ -145,11 +196,18 @@ def top_k(user_text: str, target_lang: str, k: int = 3) -> list[dict]:
|
|
| 145 |
scored.append((score, p))
|
| 146 |
scored.sort(key=lambda x: x[0], reverse=True)
|
| 147 |
out: list[dict] = []
|
| 148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
out.append({
|
| 150 |
"source": p.get("source"),
|
| 151 |
-
"target":
|
| 152 |
"category": p.get("category"),
|
| 153 |
"score": round(score, 3),
|
| 154 |
})
|
|
|
|
|
|
|
| 155 |
return out
|
|
|
|
| 11 |
Scope
|
| 12 |
- Only fires when target language is bam or ful. For en/fr output we let
|
| 13 |
the LLM (or a passthrough) handle it — nothing to short-circuit.
|
| 14 |
+
- Source keys can be English and/or French, single or multi-alias. Each
|
| 15 |
+
curated row may carry any combination of `source`, `sources` (list),
|
| 16 |
+
`source_fr`, `sources_fr` (list); the loader flattens them into one
|
| 17 |
+
match-candidate per alias so a typed paraphrase or a French equivalent
|
| 18 |
+
hits the same target translation.
|
| 19 |
|
| 20 |
Matching
|
| 21 |
- Exact match on normalised string → score 1.0 ("exact").
|
|
|
|
| 60 |
return text.strip()
|
| 61 |
|
| 62 |
|
| 63 |
+
def _expand_aliases(entry: dict) -> list[str]:
|
| 64 |
+
"""Collect every source-alias on an entry across both languages.
|
| 65 |
+
|
| 66 |
+
Schema (all fields optional, additive — existing single-`source` rows
|
| 67 |
+
keep working unchanged):
|
| 68 |
+
source : "good morning" # canonical English
|
| 69 |
+
sources : ["morning", "morning!"] # English aliases / paraphrases
|
| 70 |
+
source_fr : "bonjour" # canonical French
|
| 71 |
+
sources_fr : ["salut", "bonjour à tous"]
|
| 72 |
+
"""
|
| 73 |
+
out: list[str] = []
|
| 74 |
+
for key in ("source", "source_fr"):
|
| 75 |
+
v = entry.get(key)
|
| 76 |
+
if isinstance(v, str) and v.strip():
|
| 77 |
+
out.append(v)
|
| 78 |
+
for key in ("sources", "sources_fr"):
|
| 79 |
+
vs = entry.get(key)
|
| 80 |
+
if isinstance(vs, list):
|
| 81 |
+
out.extend(x for x in vs if isinstance(x, str) and x.strip())
|
| 82 |
+
return out
|
| 83 |
+
|
| 84 |
+
|
| 85 |
@lru_cache(maxsize=4)
|
| 86 |
def _load_phrasebook(lang: str) -> list[dict]:
|
| 87 |
+
"""Load and flatten a phrasebook into one match-candidate per alias.
|
| 88 |
+
|
| 89 |
+
Each candidate carries the canonical source/target/category for display
|
| 90 |
+
and a precomputed normalised alias (`_norm`) for the matcher to compare
|
| 91 |
+
against. One curated row with N aliases produces N candidates that all
|
| 92 |
+
point at the same target translation — the matcher picks the closest
|
| 93 |
+
alias and returns the canonical entry.
|
| 94 |
+
"""
|
| 95 |
fname = _PHRASEBOOK_FILE.get(lang)
|
| 96 |
if not fname:
|
| 97 |
return []
|
|
|
|
| 102 |
with path.open("r", encoding="utf-8") as f:
|
| 103 |
data = json.load(f)
|
| 104 |
pairs = data.get("pairs", [])
|
| 105 |
+
|
| 106 |
+
candidates: list[dict] = []
|
| 107 |
for p in pairs:
|
| 108 |
+
target = p.get("target", "")
|
| 109 |
+
category = p.get("category")
|
| 110 |
+
# Canonical source for display: prefer English `source`, else first
|
| 111 |
+
# English `sources`, else French canonical, else first French alias.
|
| 112 |
+
canonical = (
|
| 113 |
+
p.get("source")
|
| 114 |
+
or (p.get("sources") or [None])[0]
|
| 115 |
+
or p.get("source_fr")
|
| 116 |
+
or (p.get("sources_fr") or [None])[0]
|
| 117 |
+
or ""
|
| 118 |
+
)
|
| 119 |
+
for alias in _expand_aliases(p):
|
| 120 |
+
candidates.append({
|
| 121 |
+
"source": canonical or alias,
|
| 122 |
+
"target": target,
|
| 123 |
+
"category": category,
|
| 124 |
+
"_alias": alias,
|
| 125 |
+
"_norm": _normalize(alias),
|
| 126 |
+
})
|
| 127 |
+
return candidates
|
| 128 |
|
| 129 |
|
| 130 |
def lookup(
|
|
|
|
| 196 |
scored.append((score, p))
|
| 197 |
scored.sort(key=lambda x: x[0], reverse=True)
|
| 198 |
out: list[dict] = []
|
| 199 |
+
seen: set[str] = set() # dedupe: aliases of the same row → one slot
|
| 200 |
+
for score, p in scored:
|
| 201 |
+
target = p.get("target") or ""
|
| 202 |
+
if target in seen:
|
| 203 |
+
continue
|
| 204 |
+
seen.add(target)
|
| 205 |
out.append({
|
| 206 |
"source": p.get("source"),
|
| 207 |
+
"target": target,
|
| 208 |
"category": p.get("category"),
|
| 209 |
"score": round(score, 3),
|
| 210 |
})
|
| 211 |
+
if len(out) >= k:
|
| 212 |
+
break
|
| 213 |
return out
|