jefffffff9 Claude Opus 4.7 committed on
Commit
5815492
·
1 Parent(s): 757e833

Phrasebook: multi-alias + French source keys, plus misses-script stub

Browse files

Two-track vocabulary enrichment that keeps the matcher and UI unchanged:

1. Multi-alias entries. Each curated row may now carry any combination of
`source`, `sources` (list), `source_fr`, `sources_fr` (list). The loader
flattens them into one match-candidate per alias, all pointing at the
same target translation. A typed paraphrase ("morning, sir") or French
equivalent ("bonjour") hits the same Bambara/Pular target as the
canonical English source. Existing single-`source` rows keep working
unchanged (verified: 110 rows → 110 candidates, exact match preserved).

2. top_k now dedupes by canonical target so multi-alias rows can't crowd
out the RAG few-shot slots.

3. scripts/phrasebook_misses.py — stub for the data-driven curation loop.
Reads data/field_turns.jsonl, groups translate-phase rows where
phrasebook=null, prints the most frequent unmet inputs. Filters by
--lang and --since. --draft (LLM-proposed target per miss) is a
marked TODO — wire it when we want a tighter loop.

The actual curation work (adding French keys / aliases to existing JSON
entries) is now data, not code.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

Files changed (2) hide show
  1. scripts/phrasebook_misses.py +125 -0
  2. src/llm/phrasebook.py +66 -8
scripts/phrasebook_misses.py ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Surface frequent phrasebook misses from field telemetry.
2
+
3
+ Reads `data/field_turns.jsonl` (or $FIELD_TURNS_PATH) and groups inputs
4
+ that produced `phrasebook=null` so you can decide which ones deserve a
5
+ curated entry. This is the data-driven side of vocabulary growth: instead
6
+ of guessing what to translate next, look at what real users typed.
7
+
8
+ Usage:
9
+ python scripts/phrasebook_misses.py # top-20 misses, all langs
10
+ python scripts/phrasebook_misses.py --lang ful # filter to one target lang
11
+ python scripts/phrasebook_misses.py --top 50 # show more
12
+ python scripts/phrasebook_misses.py --since 2026-04 # filter by ISO-prefix on ts
13
+ python scripts/phrasebook_misses.py --draft # also draft an LLM target
14
+ # (HF_TOKEN required)
15
+
16
+ Output is plain text — eyeball the list, decide what's worth adding, and
17
+ paste curated translations into:
18
+ configs/dialect_anchors/{bambara,pular}_phrasebook.json
19
+
20
+ Stub status: counting + grouping is fully wired. The --draft flag is a
21
+ TODO — it'll call MinimalClient on each miss and propose a target string
22
+ for you to review. Land that when we want a tighter loop.
23
+ """
24
+ from __future__ import annotations
25
+
26
+ import argparse
27
+ import json
28
+ import os
29
+ import sys
30
+ from collections import Counter
31
+ from pathlib import Path
32
+ from typing import Optional
33
+
34
+ # Default location matches TurnLogger's default.
35
+ _REPO_ROOT = Path(__file__).resolve().parent.parent
36
+ _DEFAULT_LOG = _REPO_ROOT / "data" / "field_turns.jsonl"
37
+
38
+
39
+ def _norm(text: str) -> str:
40
+ """Cheap normalisation for grouping. Same intent as phrasebook._normalize
41
+ but kept local so this script has no project import dependency."""
42
+ return " ".join((text or "").lower().strip().split())
43
+
44
+
45
def load_misses(
    path: Path,
    lang: Optional[str] = None,
    since: Optional[str] = None,
) -> Counter:
    """Return Counter of normalised user inputs that produced phrasebook=null.

    Args:
        path: JSONL telemetry file, one JSON value per line.
        lang: If set, only count rows whose `output_lang` equals this
            (e.g. "ful").
        since: If set, only count rows whose `ts` is a string starting with
            this prefix (ISO prefix).

    Returns:
        Counter mapping normalised input text to its number of miss events.
        Empty when the file does not exist (a notice goes to stderr).

    Malformed lines are skipped rather than aborting the report: lines that
    are not valid JSON, JSON values that are not objects, and rows whose
    `ts` / text fields are not strings.
    """
    if not path.exists():
        print(f"No telemetry file at {path} — run the app to produce some turns first.",
              file=sys.stderr)
        return Counter()

    counts: Counter = Counter()
    with path.open("r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                continue
            # json.loads happily returns lists, strings, numbers, ... for a
            # valid-but-unexpected line; only objects have the fields we need.
            if not isinstance(row, dict):
                continue
            # Only translate-phase rows are useful — reply-phase rows always
            # have phrasebook=null by construction. Rows without a `phase`
            # field are counted as well.
            if row.get("phase") not in (None, "translate"):
                continue
            if row.get("phrasebook") is not None:
                continue
            if lang and row.get("output_lang") != lang:
                continue
            if since:
                ts = row.get("ts")
                # A missing or non-string timestamp cannot match the prefix.
                if not isinstance(ts, str) or not ts.startswith(since):
                    continue
            # Prefer `user_text`, fall back to `transcript`; ignore values
            # that are empty or not strings.
            text = next(
                (
                    value
                    for value in (row.get("user_text"), row.get("transcript"))
                    if isinstance(value, str) and value
                ),
                "",
            )
            text = _norm(text)
            if text and not text.startswith("("):  # skip our own sentinels
                counts[text] += 1
    return counts
86
+
87
+
88
def main() -> None:
    """CLI entry point: parse flags, tally misses, print the most frequent.

    Prints a one-line summary followed by a count/input table of the top
    `--top` inputs. `--draft` is accepted but only prints a stub notice.
    """
    # `__doc__` here is the module docstring; its first line is the summary.
    parser = argparse.ArgumentParser(description=__doc__.splitlines()[0])
    parser.add_argument(
        "--path",
        default=os.environ.get("FIELD_TURNS_PATH", str(_DEFAULT_LOG)),
        help="Path to field_turns.jsonl (default: data/field_turns.jsonl)",
    )
    parser.add_argument(
        "--lang",
        choices=["bam", "ful", "fr", "en"],
        default=None,
        help="Filter to one output language",
    )
    parser.add_argument(
        "--since",
        default=None,
        help='ISO-prefix filter on ts, e.g. "2026-04" or "2026-04-25"',
    )
    parser.add_argument(
        "--top",
        type=int,
        default=20,
        help="How many misses to print",
    )
    parser.add_argument(
        "--draft",
        action="store_true",
        help="(TODO) Also draft an LLM-proposed target for each miss",
    )
    args = parser.parse_args()

    counts = load_misses(Path(args.path), lang=args.lang, since=args.since)
    if not counts:
        print("No misses found with the current filters.")
        return

    # Summary line: totals first, then whichever filters were active.
    summary = [
        f"{sum(counts.values())} miss-events across {len(counts)} distinct inputs"
    ]
    if args.lang:
        summary.append(f" (lang={args.lang})")
    if args.since:
        summary.append(f" since {args.since}")
    summary.append(".\n")
    print("".join(summary))

    print(f"{'count':>5} input")
    print(f"{'-----':>5} -----")
    for phrase, hits in counts.most_common(args.top):
        print(f"{hits:>5} {phrase}")

    if args.draft:
        # TODO: import MinimalClient and call .chat(text, target_lang=args.lang)
        # for each top input, printing the proposed target alongside. Skipping
        # now to keep the stub dependency-free.
        print("\n[--draft is a stub — not yet implemented]")
122
+
123
+
124
+ if __name__ == "__main__":
125
+ main()
src/llm/phrasebook.py CHANGED
@@ -11,9 +11,11 @@ Purpose
11
  Scope
12
  - Only fires when target language is bam or ful. For en/fr output we let
13
  the LLM (or a passthrough) handle it — nothing to short-circuit.
14
- - Only English source keys (what the curated sheets contain). French or
15
- in-language inputs will not match and will fall through to the LLM —
16
- that's correct behaviour.
 
 
17
 
18
  Matching
19
  - Exact match on normalised string → score 1.0 ("exact").
@@ -58,8 +60,38 @@ def _normalize(text: str) -> str:
58
  return text.strip()
59
 
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  @lru_cache(maxsize=4)
62
  def _load_phrasebook(lang: str) -> list[dict]:
 
 
 
 
 
 
 
 
63
  fname = _PHRASEBOOK_FILE.get(lang)
64
  if not fname:
65
  return []
@@ -70,10 +102,29 @@ def _load_phrasebook(lang: str) -> list[dict]:
70
  with path.open("r", encoding="utf-8") as f:
71
  data = json.load(f)
72
  pairs = data.get("pairs", [])
73
- # Precompute normalised source for speed.
 
74
  for p in pairs:
75
- p["_norm"] = _normalize(p.get("source", ""))
76
- return pairs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
 
79
  def lookup(
@@ -145,11 +196,18 @@ def top_k(user_text: str, target_lang: str, k: int = 3) -> list[dict]:
145
  scored.append((score, p))
146
  scored.sort(key=lambda x: x[0], reverse=True)
147
  out: list[dict] = []
148
- for score, p in scored[:k]:
 
 
 
 
 
149
  out.append({
150
  "source": p.get("source"),
151
- "target": p.get("target"),
152
  "category": p.get("category"),
153
  "score": round(score, 3),
154
  })
 
 
155
  return out
 
11
  Scope
12
  - Only fires when target language is bam or ful. For en/fr output we let
13
  the LLM (or a passthrough) handle it — nothing to short-circuit.
14
+ - Source keys can be English and/or French, single or multi-alias. Each
15
+ curated row may carry any combination of `source`, `sources` (list),
16
+ `source_fr`, `sources_fr` (list); the loader flattens them into one
17
+ match-candidate per alias so a typed paraphrase or a French equivalent
18
+ hits the same target translation.
19
 
20
  Matching
21
  - Exact match on normalised string → score 1.0 ("exact").
 
60
  return text.strip()
61
 
62
 
63
+ def _expand_aliases(entry: dict) -> list[str]:
64
+ """Collect every source-alias on an entry across both languages.
65
+
66
+ Schema (all fields optional, additive — existing single-`source` rows
67
+ keep working unchanged):
68
+ source : "good morning" # canonical English
69
+ sources : ["morning", "morning!"] # English aliases / paraphrases
70
+ source_fr : "bonjour" # canonical French
71
+ sources_fr : ["salut", "bonjour à tous"]
72
+ """
73
+ out: list[str] = []
74
+ for key in ("source", "source_fr"):
75
+ v = entry.get(key)
76
+ if isinstance(v, str) and v.strip():
77
+ out.append(v)
78
+ for key in ("sources", "sources_fr"):
79
+ vs = entry.get(key)
80
+ if isinstance(vs, list):
81
+ out.extend(x for x in vs if isinstance(x, str) and x.strip())
82
+ return out
83
+
84
+
85
  @lru_cache(maxsize=4)
86
  def _load_phrasebook(lang: str) -> list[dict]:
87
+ """Load and flatten a phrasebook into one match-candidate per alias.
88
+
89
+ Each candidate carries the canonical source/target/category for display
90
+ and a precomputed normalised alias (`_norm`) for the matcher to compare
91
+ against. One curated row with N aliases produces N candidates that all
92
+ point at the same target translation — the matcher picks the closest
93
+ alias and returns the canonical entry.
94
+ """
95
  fname = _PHRASEBOOK_FILE.get(lang)
96
  if not fname:
97
  return []
 
102
  with path.open("r", encoding="utf-8") as f:
103
  data = json.load(f)
104
  pairs = data.get("pairs", [])
105
+
106
+ candidates: list[dict] = []
107
  for p in pairs:
108
+ target = p.get("target", "")
109
+ category = p.get("category")
110
+ # Canonical source for display: prefer English `source`, else first
111
+ # English `sources`, else French canonical, else first French alias.
112
+ canonical = (
113
+ p.get("source")
114
+ or (p.get("sources") or [None])[0]
115
+ or p.get("source_fr")
116
+ or (p.get("sources_fr") or [None])[0]
117
+ or ""
118
+ )
119
+ for alias in _expand_aliases(p):
120
+ candidates.append({
121
+ "source": canonical or alias,
122
+ "target": target,
123
+ "category": category,
124
+ "_alias": alias,
125
+ "_norm": _normalize(alias),
126
+ })
127
+ return candidates
128
 
129
 
130
  def lookup(
 
196
  scored.append((score, p))
197
  scored.sort(key=lambda x: x[0], reverse=True)
198
  out: list[dict] = []
199
+ seen: set[str] = set() # dedupe: aliases of the same row → one slot
200
+ for score, p in scored:
201
+ target = p.get("target") or ""
202
+ if target in seen:
203
+ continue
204
+ seen.add(target)
205
  out.append({
206
  "source": p.get("source"),
207
+ "target": target,
208
  "category": p.get("category"),
209
  "score": round(score, 3),
210
  })
211
+ if len(out) >= k:
212
+ break
213
  return out