Spaces:

build-small-hackathon
/

lesson-agent

Running on Zero

MSG msgencrypted-auto committed on 15 days ago

Commit

bbff1ca

1 Parent(s): 28543d3

Feat/last hour (#24)

* eval experiment and profiles

* experiments

* french language

* french language

* evals

* evals

---------

Co-authored-by: msgencrypted-auto <msgencrypted.auto@gmail.com>

Files changed (11) hide show

research/data/build_language_lesson_chat.py +42 -0
research/evals/configs/eval_profiles.yaml +53 -0
research/evals/configs/lm_eval_commonsense.yaml +20 -0
research/evals/configs/lm_eval_french.yaml +21 -0
research/evals/configs/lm_eval_medical.yaml +20 -0
research/evals/configs/lm_eval_multilingual.yaml +20 -0
research/evals/configs/lm_eval_safety.yaml +22 -0
research/evals/docs/eval_profiles.md +4 -0
research/finetune.py +11 -5
research/modal/README.md +2 -0
research/modal/experiments.yaml +217 -17

research/data/build_language_lesson_chat.py CHANGED Viewed

@@ -43,9 +43,11 @@ MAX_ASSISTANT_CHARS = 600
 EVAL_HOLDOUT_RATIO = 0.05
 DEFAULT_FR_SOURCES = (
     "angeluriot/french_instruct",
     "CohereLabs/aya_dataset",
     "pinzhenchen/alpaca-cleaned-fr",
 )
 DEFAULT_AR_SOURCES = (
     "arbml/CIDAR",
@@ -54,9 +56,11 @@ DEFAULT_AR_SOURCES = (
 )
 SOURCE_CAPS: dict[str, dict[str, int]] = {
     "angeluriot/french_instruct": {"fr": 8000},
     "CohereLabs/aya_dataset": {"fr": 3000, "ar": 3000},
     "pinzhenchen/alpaca-cleaned-fr": {"fr": 2000},
     "arbml/CIDAR": {"ar": 8000},
     "ClusterlabAi/InstAr-500k": {"ar": 5000},
 }
@@ -138,6 +142,23 @@ def _load_seeds(path: Path) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]
     return fr_rows, ar_rows
 def _iter_french_instruct(max_rows: int) -> Iterator[tuple[str, str, str | None]]:
     from datasets import load_dataset
@@ -197,6 +218,25 @@ def _iter_alpaca_fr(max_rows: int) -> Iterator[tuple[str, str, str | None]]:
                 break
 def _iter_cidar(max_rows: int) -> Iterator[tuple[str, str, str | None]]:
     from datasets import load_dataset
@@ -235,12 +275,14 @@ def _iter_instar(max_rows: int) -> Iterator[tuple[str, str, str | None]]:
 _SOURCE_LOADERS: dict[str, dict[str, Any]] = {
     "angeluriot/french_instruct": {"fr": _iter_french_instruct},
     "CohereLabs/aya_dataset": {
         "fr": lambda n: _iter_aya("fra", n),
         "ar": lambda n: _iter_aya("arb", n),
     },
     "pinzhenchen/alpaca-cleaned-fr": {"fr": _iter_alpaca_fr},
     "arbml/CIDAR": {"ar": _iter_cidar},
     "ClusterlabAi/InstAr-500k": {"ar": _iter_instar},
 }

 EVAL_HOLDOUT_RATIO = 0.05
 DEFAULT_FR_SOURCES = (
+    "FrancophonIA/english_french",
     "angeluriot/french_instruct",
     "CohereLabs/aya_dataset",
     "pinzhenchen/alpaca-cleaned-fr",
+    "jpacifico/French-Alpaca-dataset-Instruct-110K",
 )
 DEFAULT_AR_SOURCES = (
     "arbml/CIDAR",
 )
 SOURCE_CAPS: dict[str, dict[str, int]] = {
+    "FrancophonIA/english_french": {"fr": 4000},
     "angeluriot/french_instruct": {"fr": 8000},
     "CohereLabs/aya_dataset": {"fr": 3000, "ar": 3000},
     "pinzhenchen/alpaca-cleaned-fr": {"fr": 2000},
+    "jpacifico/French-Alpaca-dataset-Instruct-110K": {"fr": 4000},
     "arbml/CIDAR": {"ar": 8000},
     "ClusterlabAi/InstAr-500k": {"ar": 5000},
 }
     return fr_rows, ar_rows
+def _iter_english_french(max_rows: int) -> Iterator[tuple[str, str, str | None]]:  # yields (user_prompt, french_reply, None) triples
+    """EN→FR parallel sentences — user asks in English, coach replies in French."""
+    from datasets import load_dataset  # function-scope import, same as the other loaders in this file
+    ds = load_dataset("FrancophonIA/english_french", split="train", streaming=True)  # train split, opened in streaming mode
+    count = 0  # number of pairs yielded so far
+    for row in ds:
+        english = (row.get("english") or "").strip()  # a missing or None field becomes ""
+        french = (row.get("french") or "").strip()
+        if english and _assistant_ok(french):  # skip empty sources and replies rejected by _assistant_ok (defined elsewhere in the file)
+            user = f"Translate the following to French:\n{english}"
+            yield user, french, None
+            count += 1
+            if count >= max_rows:  # checked after the yield, so one qualifying pair is still emitted when max_rows <= 0
+                break
 def _iter_french_instruct(max_rows: int) -> Iterator[tuple[str, str, str | None]]:
     from datasets import load_dataset
                 break
+def _iter_french_alpaca_110k(max_rows: int) -> Iterator[tuple[str, str, str | None]]:  # yields (user_text, output, None) triples
+    from datasets import load_dataset  # function-scope import, same as the other loaders in this file
+    ds = load_dataset(
+        "jpacifico/French-Alpaca-dataset-Instruct-110K", split="train", streaming=True
+    )
+    count = 0  # number of pairs yielded so far
+    for row in ds:
+        instruction = (row.get("instruction") or "").strip()  # a missing or None field becomes ""
+        inp = (row.get("input") or "").strip()
+        output = (row.get("output") or "").strip()
+        user_text = f"{instruction}\n{inp}".strip() if inp else instruction  # optional input goes on its own line after the instruction
+        if user_text and _assistant_ok(output):  # skip empty prompts and outputs rejected by _assistant_ok (defined elsewhere in the file)
+            yield user_text, output, None
+            count += 1
+            if count >= max_rows:  # checked after the yield, so one qualifying pair is still emitted when max_rows <= 0
+                break
 def _iter_cidar(max_rows: int) -> Iterator[tuple[str, str, str | None]]:
     from datasets import load_dataset
 _SOURCE_LOADERS: dict[str, dict[str, Any]] = {
+    "FrancophonIA/english_french": {"fr": _iter_english_french},
     "angeluriot/french_instruct": {"fr": _iter_french_instruct},
     "CohereLabs/aya_dataset": {
         "fr": lambda n: _iter_aya("fra", n),
         "ar": lambda n: _iter_aya("arb", n),
     },
     "pinzhenchen/alpaca-cleaned-fr": {"fr": _iter_alpaca_fr},
+    "jpacifico/French-Alpaca-dataset-Instruct-110K": {"fr": _iter_french_alpaca_110k},
     "arbml/CIDAR": {"ar": _iter_cidar},
     "ClusterlabAi/InstAr-500k": {"ar": _iter_instar},
 }

research/evals/configs/eval_profiles.yaml CHANGED Viewed

@@ -72,6 +72,59 @@ profiles:
     tasks:
       - ifeval
   general_slm:
     tool: slm-lm-eval
     claim: General ~1B SLM baseline

     tasks:
       - ifeval
+  medical:
+    tool: slm-lm-eval
+    claim: Better medical knowledge
+    description: Clinical Q&A — PubMedQA + MedMCQA + MedQA (USMLE) with arc guard.
+    config: lm_eval_medical.yaml
+    tasks:
+      - pubmedqa
+      - medmcqa
+      - medqa_4options
+      - arc_challenge
+  multilingual:
+    tool: slm-lm-eval
+    claim: Better multilingual understanding
+    description: Cross-lingual NLI / commonsense / coreference (XNLI, XCOPA, XWinograd).
+    config: lm_eval_multilingual.yaml
+    tasks:
+      - xnli
+      - xcopa
+      - xwinograd
+  commonsense:
+    tool: slm-lm-eval
+    claim: Better commonsense reasoning
+    description: Everyday-knowledge MCQ + coreference + physical commonsense.
+    config: lm_eval_commonsense.yaml
+    tasks:
+      - commonsense_qa
+      - winogrande
+      - piqa
+      - hellaswag
+  safety:
+    tool: slm-lm-eval
+    claim: More truthful, fewer imitative falsehoods
+    description: TruthfulQA MC2/MC1 (eval-only; do not train on the test set).
+    config: lm_eval_safety.yaml
+    tasks:
+      - truthfulqa_mc2
+      - truthfulqa_mc1
+      - arc_easy
+  french:
+    tool: slm-lm-eval
+    claim: Better French understanding and translation
+    description: Official FrenchBench MC tasks + WMT14 EN→FR (CroissantLLM benchmark suite).
+    config: lm_eval_french.yaml
+    tasks:
+      - french_bench_xnli
+      - belebele_fra_Latn
+      - french_bench_boolqa
+      - wmt14-en-fr
   general_slm:
     tool: slm-lm-eval
     claim: General ~1B SLM baseline

research/evals/configs/lm_eval_commonsense.yaml ADDED Viewed

	@@ -0,0 +1,20 @@

+# Commonsense profile — everyday reasoning, coreference, causal commonsense
+# Run: slm-lm-eval --profile commonsense --preset minicpm5-1b --experiment-name commonsense-baseline
+profile: commonsense
+claim: Better commonsense reasoning
+tasks:
+  - commonsense_qa  # 5-way everyday-knowledge MCQ (gate task)
+  - winogrande      # pronoun-resolution commonsense
+  - piqa            # physical commonsense (general-capability guard)
+  - hellaswag       # grounded commonsense guard
+num_fewshot: 0
+limit: 200
+seed: 42
+batch_size: auto
+device: auto
+dtype: bfloat16
+trust_remote_code: true
+output_dir: results/lm_eval

research/evals/configs/lm_eval_french.yaml ADDED Viewed

	@@ -0,0 +1,21 @@

+# French profile — official FrenchBench (CroissantLLM) + EN→FR translation
+# Pairs with french-lora (FrancophonIA/english_french). Run:
+#   slm-lm-eval --profile french --preset minicpm5-1b --experiment-name french-baseline
+profile: french
+claim: Better French understanding and translation
+tasks:
+  - french_bench_xnli      # French NLI (multiple choice; FrenchBench official)
+  - belebele_fra_Latn      # French reading comprehension (FLORES-200 based)
+  - french_bench_boolqa    # French boolean QA
+  - wmt14-en-fr            # WMT14 English→French translation (BLEU)
+num_fewshot: 0
+limit: 100
+seed: 42
+batch_size: auto
+device: auto
+dtype: bfloat16
+trust_remote_code: true
+output_dir: results/lm_eval

research/evals/configs/lm_eval_medical.yaml ADDED Viewed

	@@ -0,0 +1,20 @@

+# Medical profile — clinical Q&A fact recall + reasoning
+# Run: slm-lm-eval --profile medical --preset minicpm5-1b --experiment-name medical-baseline
+profile: medical
+claim: Better medical knowledge
+tasks:
+  - pubmedqa        # yes/no/maybe over biomedical abstracts (gate task)
+  - medmcqa         # multi-subject medical entrance-exam MCQ
+  - medqa_4options  # USMLE-style 4-option clinical MCQ
+  - arc_challenge   # general-capability guard (catch regression from skill tuning)
+num_fewshot: null   # per-task canonical fewshot
+limit: 200          # larger sample -> tighter stderr for gate decisions
+seed: 42
+batch_size: auto
+device: auto
+dtype: bfloat16
+trust_remote_code: true
+output_dir: results/lm_eval

research/evals/configs/lm_eval_multilingual.yaml ADDED Viewed

	@@ -0,0 +1,20 @@

+# Multilingual profile — cross-lingual NLI / commonsense / coreference
+# Pairs with the FR/AR language-lesson adapter. Run:
+#   slm-lm-eval --profile multilingual --preset minicpm5-1b --experiment-name multilingual-baseline
+profile: multilingual
+claim: Better multilingual understanding
+tasks:
+  - xnli       # cross-lingual natural-language inference (15 langs incl. fr/ar)
+  - xcopa      # cross-lingual causal commonsense
+  - xwinograd  # cross-lingual coreference (Winograd schema)
+num_fewshot: 0
+limit: 100
+seed: 42
+batch_size: auto
+device: auto
+dtype: bfloat16
+trust_remote_code: true
+output_dir: results/lm_eval

research/evals/configs/lm_eval_safety.yaml ADDED Viewed

	@@ -0,0 +1,22 @@

+# Safety / truthfulness profile — resist imitative falsehoods
+# EVAL-ONLY: do NOT fine-tune on TruthfulQA (it is the test set — contamination).
+# Improve it indirectly via high-quality helpful/honest instruction data, then
+# measure here. Run:
+#   slm-lm-eval --profile safety --preset minicpm5-1b --experiment-name safety-baseline
+profile: safety
+claim: More truthful, fewer imitative falsehoods
+tasks:
+  - truthfulqa_mc2  # multi-true MC truthfulness (primary)
+  - truthfulqa_mc1  # single-true MC truthfulness
+  - arc_easy        # general-capability guard
+num_fewshot: 0
+limit: 200
+seed: 42
+batch_size: auto
+device: auto
+dtype: bfloat16
+trust_remote_code: true
+output_dir: results/lm_eval

research/evals/docs/eval_profiles.md CHANGED Viewed

@@ -51,6 +51,8 @@ Use **one profile per claim**. Do not compare training loss to lm-eval accuracy.
 | Better language understanding | `understanding` | `slm-lm-eval` | `boolq`, `piqa`, `copa`, `rte` |
 | Better code generation | `code` | `slm-lm-eval` | `humaneval`, `mbpp` |
 | Better instruction following | `instructions` | `slm-lm-eval` | `ifeval` |
 | General ~1B SLM baseline | `general_slm` | `slm-lm-eval` | 6-task mix (full splits) |
 | Baseline vs finetune study | `compare_study` | `slm-lm-eval` | Same 6 tasks, limit 100 |
 | Tool use / function calling | `agentic_tool_use` | `slm-benchmark` | `bfcl`, `tau_bench` |
@@ -72,6 +74,8 @@ Use **one profile per claim**. Do not compare training loss to lm-eval accuracy.
 | `understanding` | `lm_eval_understanding.yaml` |
 | `code` | `lm_eval_code.yaml` |
 | `instructions` | `lm_eval_instructions.yaml` |
 | `general_slm` | `lm_eval_minicpm5.yaml` |
 | `compare_study` | `lm_eval_compare_study.yaml` |

 | Better language understanding | `understanding` | `slm-lm-eval` | `boolq`, `piqa`, `copa`, `rte` |
 | Better code generation | `code` | `slm-lm-eval` | `humaneval`, `mbpp` |
 | Better instruction following | `instructions` | `slm-lm-eval` | `ifeval` |
+| Better French / translation | `french` | `slm-lm-eval` | `french_bench_xnli`, `belebele_fra_Latn`, `wmt14-en-fr`, … |
+| Better multilingual understanding | `multilingual` | `slm-lm-eval` | `xnli`, `xcopa`, `xwinograd` |
 | General ~1B SLM baseline | `general_slm` | `slm-lm-eval` | 6-task mix (full splits) |
 | Baseline vs finetune study | `compare_study` | `slm-lm-eval` | Same 6 tasks, limit 100 |
 | Tool use / function calling | `agentic_tool_use` | `slm-benchmark` | `bfcl`, `tau_bench` |
 | `understanding` | `lm_eval_understanding.yaml` |
 | `code` | `lm_eval_code.yaml` |
 | `instructions` | `lm_eval_instructions.yaml` |
+| `french` | `lm_eval_french.yaml` |
+| `multilingual` | `lm_eval_multilingual.yaml` |
 | `general_slm` | `lm_eval_minicpm5.yaml` |
 | `compare_study` | `lm_eval_compare_study.yaml` |

research/finetune.py CHANGED Viewed

@@ -471,12 +471,13 @@ def save_training_results(
     return path
-def to_prompt_response(example, fmt, tokenizer, keys=None):
     """Normalize any supported format into a single training string,
     returning (full_text, prompt_text). prompt_text is None for raw text.
     `keys` optionally remaps a dataset's column names onto the format's
-    expected fields (e.g. {"prompt": "query"} for MetaMathQA)."""
     keys = keys or {}
     if fmt == "text":
         return example[keys.get("text", "text")], None
@@ -491,6 +492,8 @@ def to_prompt_response(example, fmt, tokenizer, keys=None):
     elif fmt == "prompt":
         prompt = example.get(keys.get("prompt", "prompt"), "")
         rkey = keys.get("response")
         resp = example.get(rkey, "") if rkey else example.get(
             "completion", example.get("response", ""))
@@ -517,9 +520,10 @@ def to_prompt_response(example, fmt, tokenizer, keys=None):
     return full, prompt_only
-def build_tokenize_fn(tokenizer, fmt, max_len, mask_prompt, keys=None):
     def fn(example):
-        full, prompt = to_prompt_response(example, fmt, tokenizer, keys)
         ids = tokenizer(full, truncation=True, max_length=max_len,
                         add_special_tokens=(fmt == "text"))["input_ids"]
         labels = list(ids)
@@ -593,7 +597,9 @@ def build_training_dataset(args, tokenizer):
         raw = raw.shuffle(seed=args.seed)
         keys = spec.get("columns") or {}
         max_len = spec.get("max_len", args.max_len)
-        tokenize = build_tokenize_fn(tokenizer, fmt, max_len, args.mask_prompt, keys)
         tok = raw.map(tokenize, remove_columns=raw.column_names,
                       desc=f"tokenizing {dataset}")
         tok = tok.filter(lambda e: len(e["input_ids"]) > 1)

     return path
+def to_prompt_response(example, fmt, tokenizer, keys=None, prompt_prefix=None):
     """Normalize any supported format into a single training string,
     returning (full_text, prompt_text). prompt_text is None for raw text.
     `keys` optionally remaps a dataset's column names onto the format's
+    expected fields (e.g. {"prompt": "query"} for MetaMathQA).
+    `prompt_prefix` prepends fixed instruction text to prompt-format user turns."""
     keys = keys or {}
     if fmt == "text":
         return example[keys.get("text", "text")], None
     elif fmt == "prompt":
         prompt = example.get(keys.get("prompt", "prompt"), "")
+        if prompt_prefix:
+            prompt = f"{prompt_prefix}{prompt}"
         rkey = keys.get("response")
         resp = example.get(rkey, "") if rkey else example.get(
             "completion", example.get("response", ""))
     return full, prompt_only
+def build_tokenize_fn(tokenizer, fmt, max_len, mask_prompt, keys=None, prompt_prefix=None):
     def fn(example):
+        full, prompt = to_prompt_response(
+            example, fmt, tokenizer, keys, prompt_prefix=prompt_prefix)
         ids = tokenizer(full, truncation=True, max_length=max_len,
                         add_special_tokens=(fmt == "text"))["input_ids"]
         labels = list(ids)
         raw = raw.shuffle(seed=args.seed)
         keys = spec.get("columns") or {}
         max_len = spec.get("max_len", args.max_len)
+        prefix = spec.get("prompt_prefix")
+        tokenize = build_tokenize_fn(
+            tokenizer, fmt, max_len, args.mask_prompt, keys, prompt_prefix=prefix)
         tok = raw.map(tokenize, remove_columns=raw.column_names,
                       desc=f"tokenizing {dataset}")
         tok = tok.filter(lambda e: len(e["input_ids"]) > 1)

research/modal/README.md CHANGED Viewed

@@ -101,6 +101,8 @@ QLoRA adapter per category, each evaluated against the matching
 | `math-lora` | math | `TIGER-Lab/MathInstruct` (`alpaca`) | `math` | `gsm8k` (+ `arc_challenge` guard) | ✅ |
 | `coding-lora` | coding | `iamtarun/python_code_instructions_18k_alpaca` (`alpaca`) | `code` | `mbpp` | ✅ |
 | `reasoning-lora` | reasoning | `HuggingFaceTB/smoltalk` (`chat`) | `reasoning` | `gsm8k` (+ `hellaswag` guard) | ✅ |
 | `alpaca-lora` | instructions | `tatsu-lab/alpaca` (`alpaca`) | `instructions` | — (no `goals`) | local-only |
 Before publishing, replace `defaults.hub_org` and each job's `publish.hub_repo`

 | `math-lora` | math | `TIGER-Lab/MathInstruct` (`alpaca`) | `math` | `gsm8k` (+ `arc_challenge` guard) | ✅ |
 | `coding-lora` | coding | `iamtarun/python_code_instructions_18k_alpaca` (`alpaca`) | `code` | `mbpp` | ✅ |
 | `reasoning-lora` | reasoning | `HuggingFaceTB/smoltalk` (`chat`) | `reasoning` | `gsm8k` (+ `hellaswag` guard) | ✅ |
+| `language-lesson-lora` | language | `language-lesson-fr/ar.jsonl` (`chat`) | `multilingual` | `xnli` (+ `hellaswag` guard) | ✅ |
+| `french-lora` | french | `FrancophonIA/english_french` (`prompt`) + FR chat | `french` | `french_bench_xnli` (+ `hellaswag` guard) | ✅ |
 | `alpaca-lora` | instructions | `tatsu-lab/alpaca` (`alpaca`) | `instructions` | — (no `goals`) | local-only |
 Before publishing, replace `defaults.hub_org` and each job's `publish.hub_repo`

research/modal/experiments.yaml CHANGED Viewed

@@ -81,28 +81,63 @@ finetune:
         - build-small-hackathon/minicpm5-1b-teaching-lora
       private: false
-  # --- science: factual + explanatory science tutoring ---
-  # 10 local science-tutor chats overfit easily on their own; mix in alpaca
-  # replay + NEFTune + bigger LoRA rank (same recipe as teaching/math/language)
-  # so sciq recall improves instead of regressing from narrow chat-format drift.
   - name: science-lora
     category: science
-    max_steps: 150
     mix:
-      - dataset: research/data/science-tutor-chat.jsonl
         format: chat
-        weight: 16                    # ~10 samples -> ~160 examples
-      - dataset: tatsu-lab/alpaca      # general replay: protect sciq/arc_challenge
         format: alpaca
-        dataset_split: "train[:600]"
-        max_samples: 600
     args:
-      lora_r: 32
-      lora_alpha: 64
-      neftune_noise_alpha: 5
-      early_stopping_patience: 2   # keep best eval_loss checkpoint, not the last
       val_split: 0.05
-    description: Science tutor Q&A chat + alpaca replay, r=32 + NEFTune
     eval_profile: science
     goals:
       task: sciq
@@ -214,6 +249,127 @@ finetune:
         - build-small-hackathon/minicpm5-1b-reasoning-lora
       private: false
   # --- general instructions baseline: no goals/publish -> local-only adapter ---
   - name: alpaca-lora
     category: instructions
@@ -252,9 +408,9 @@ finetune:
     description: >
       FR/AR TeacherVoice LoRA from language-lesson-fr/ar.jsonl (Hub-built via
       build_language_lesson_chat.py) + English replay
-    eval_profile: understanding
     goals:
-      task: boolq
       min_improve: 0.0
       guard_tasks:
         - task: hellaswag
@@ -264,3 +420,47 @@ finetune:
       mirror_repos:
         - build-small-hackathon/minicpm5-1b-language-lesson-lora
       private: false

         - build-small-hackathon/minicpm5-1b-teaching-lora
       private: false
+  # --- science: MC-format science Q&A (sciq/ARC/OpenBookQA train) ---
+  # Previous attempt used chat-format tutoring — wrong signal for MC benchmarks.
+  # Model already scores 0.935 sciq; needs in-distribution MC Q→A to push higher.
+  # allenai/sciq train: 11k factual science MC (question→correct_answer).
+  # allenai/ai2_arc ARC-Easy train: elementary/science MC, boosts arc_* guards.
+  # allenai/openbookqa train: fact-based science Q&A, improves openbookqa eval.
+  # Local science-tutor-chat kept at low weight for style/explanation diversity.
+  # MetaMathQA slice protects gsm8k guard (prevented 0.14 regression last run).
+  # Reduced to r=16, no NEFTune: less catastrophic forgetting on small datasets.
   - name: science-lora
     category: science
+    max_steps: 120
     mix:
+      - dataset: allenai/sciq           # 11k MC science Q→A (in-distribution with sciq eval)
+        format: prompt
+        columns:
+          prompt: question
+          response: correct_answer
+        dataset_split: "train[:1500]"
+        max_samples: 1500
+      - dataset: allenai/ai2_arc        # ARC-Easy config only: elementary science MC
+        format: prompt
+        dataset_config: ARC-Easy
+        columns:
+          prompt: question
+          response: answerKey
+        dataset_split: "train[:500]"
+        max_samples: 500
+      - dataset: allenai/openbookqa     # fact-based open science Q&A
+        format: prompt
+        columns:
+          prompt: question_stem
+          response: answerKey
+        dataset_split: "train[:400]"
+        max_samples: 400
+      - dataset: research/data/science-tutor-chat.jsonl   # style diversity
         format: chat
+        weight: 4
+      - dataset: meta-math/MetaMathQA   # gsm8k guard protection
+        format: prompt
+        columns:
+          prompt: query
+          response: response
+        dataset_split: "train[:200]"
+        max_samples: 200
+      - dataset: tatsu-lab/alpaca       # general replay: protect hellaswag/piqa/boolq
         format: alpaca
+        dataset_split: "train[:400]"
+        max_samples: 400
     args:
+      lora_r: 16
+      lora_alpha: 32
+      early_stopping_patience: 3
       val_split: 0.05
+    description: >
+      sciq + ARC-Easy + OpenBookQA MC train + science-chat style + MetaMathQA
+      guard + alpaca replay. r=16, no NEFTune to avoid gsm8k regression.
     eval_profile: science
     goals:
       task: sciq
         - build-small-hackathon/minicpm5-1b-reasoning-lora
       private: false
+  # --- medical: clinical Q&A (MedQA/Meadow) + alpaca replay ---
+  # New vertical. Same overfit-guard recipe as teaching (science-lora has since
+  # moved to r=16 without NEFTune): a focused skill dataset up-weighted, alpaca
+  # replay + NEFTune + r=32 so PubMedQA/MedMCQA improve without regressing the
+  # arc_challenge general-knowledge guard.
+  - name: medical-lora
+    category: medical
+    max_steps: 200
+    mix:
+      - dataset: medalpaca/medical_meadow_medqa   # USMLE-style QA, alpaca columns
+        format: alpaca
+        dataset_split: "train[:2000]"
+        max_samples: 2000
+      - dataset: tatsu-lab/alpaca                  # general replay: protect guards
+        format: alpaca
+        dataset_split: "train[:600]"
+        max_samples: 600
+    args:
+      lora_r: 32
+      lora_alpha: 64
+      neftune_noise_alpha: 5
+      early_stopping_patience: 2
+      val_split: 0.05
+    description: Medical QA (medalpaca Meadow MedQA) + alpaca replay, r=32 + NEFTune
+    eval_profile: medical
+    goals:
+      task: pubmedqa
+      min_score: 0.45
+      min_improve: 0.02
+      guard_tasks:
+        - task: arc_challenge
+          max_regress: 0.03
+    publish:
+      hub_repo: MSGEncrypted/minicpm5-1b-medical-lora
+      mirror_repos:
+        - build-small-hackathon/minicpm5-1b-medical-lora
+      private: false
+  # --- tool-use: function/tool-calling (xLAM) ---
+  # New vertical that closes the loop with the existing BFCL agentic benchmark.
+  # The publish gate guards general ability (lm-eval has no function-call task);
+  # the *skill* metric is the BFCL/tau-bench suite run via slm-benchmark:
+  #   uv run --package slm-evals slm-benchmark --model <adapter> --benchmarks bfcl --max-samples 50
+  - name: tool-use-lora
+    category: tool_use
+    max_steps: 200
+    mix:
+      - dataset: Salesforce/xlam-function-calling-60k
+        format: prompt
+        columns:
+          prompt: query
+          response: answers           # JSON function-call(s) the model must emit
+        dataset_split: "train[:3000]"
+        max_samples: 3000
+      - dataset: tatsu-lab/alpaca       # general replay: protect guards
+        format: alpaca
+        dataset_split: "train[:600]"
+        max_samples: 600
+    args:
+      lora_r: 32
+      lora_alpha: 64
+      neftune_noise_alpha: 5
+      early_stopping_patience: 2
+      val_split: 0.05
+    description: >
+      Function/tool-calling (Salesforce xLAM) + alpaca replay. Skill metric is the
+      BFCL agentic suite (slm-benchmark); lm-eval gate only guards general ability.
+    eval_profile: compare_study
+    goals:
+      task: arc_easy
+      min_improve: 0.0
+      guard_tasks:
+        - task: hellaswag
+          max_regress: 0.03
+        - task: piqa
+          max_regress: 0.03
+    publish:
+      hub_repo: MSGEncrypted/minicpm5-1b-tool-use-lora
+      mirror_repos:
+        - build-small-hackathon/minicpm5-1b-tool-use-lora
+      private: false
+  # --- commonsense: everyday-reasoning MCQ (CommonsenseQA train) ---
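+  # New vertical. In-distribution MC train (question -> answerKey), same recipe
+  # as science-lora's ARC/OpenBookQA slices, with alpaca replay and piqa/hellaswag guards.
+  # NOTE(review): answerKey appears to be only the option label, and the prompt uses just
+  # the `question` column — confirm the answer choices actually reach the prompt.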
+  - name: commonsense-lora
+    category: commonsense
+    max_steps: 150
+    mix:
+      - dataset: tau/commonsense_qa     # 5-way everyday-knowledge MCQ, in-distribution
+        format: prompt
+        columns:
+          prompt: question
+          response: answerKey
+        dataset_split: "train[:2000]"
+        max_samples: 2000
+      - dataset: tatsu-lab/alpaca       # general replay: protect piqa/hellaswag guards
+        format: alpaca
+        dataset_split: "train[:600]"
+        max_samples: 600
+    args:
+      lora_r: 16
+      lora_alpha: 32
+      early_stopping_patience: 2
+      val_split: 0.05
+    description: CommonsenseQA MC train + alpaca replay, r=16
+    eval_profile: commonsense
+    goals:
+      task: commonsense_qa
+      min_score: 0.30
+      min_improve: 0.02
+      guard_tasks:
+        - task: piqa
+          max_regress: 0.03
+        - task: hellaswag
+          max_regress: 0.03
+    publish:
+      hub_repo: MSGEncrypted/minicpm5-1b-commonsense-lora
+      mirror_repos:
+        - build-small-hackathon/minicpm5-1b-commonsense-lora
+      private: false
   # --- general instructions baseline: no goals/publish -> local-only adapter ---
   - name: alpaca-lora
     category: instructions
     description: >
       FR/AR TeacherVoice LoRA from language-lesson-fr/ar.jsonl (Hub-built via
       build_language_lesson_chat.py) + English replay
+    eval_profile: multilingual
     goals:
+      task: xnli
       min_improve: 0.0
       guard_tasks:
         - task: hellaswag
       mirror_repos:
         - build-small-hackathon/minicpm5-1b-language-lesson-lora
       private: false
+  # --- french: EN→FR translation (FrancophonIA/english_french) + FrenchBench gate ---
+  # 320k parallel sentences from Kaggle englishfrench-fornmt (Hub: FrancophonIA/english_french).
+  # FrenchBench (CroissantLLM) is the official French lm-eval suite; gate on french_bench_xnli.
+  - name: french-lora
+    category: french
+    max_steps: 150
+    mix:
+      - dataset: FrancophonIA/english_french
+        format: prompt
+        columns:
+          prompt: english
+          response: french
+        prompt_prefix: "Translate the following English sentence to French:\n"
+        dataset_split: "train[:3000]"
+        max_samples: 3000
+      - dataset: research/data/language-lesson-fr.jsonl
+        format: chat
+        weight: 6
+      - dataset: tatsu-lab/alpaca
+        format: alpaca
+        dataset_split: "train[:400]"
+        max_samples: 400
+    args:
+      lora_r: 32
+      lora_alpha: 64
+      neftune_noise_alpha: 5
+      early_stopping_patience: 2
+      val_split: 0.05
+    description: >
+      EN→FR translation (FrancophonIA/english_french) + TeacherVoice FR chat +
+      alpaca replay. Evaluated on FrenchBench (french_bench_xnli, belebele_fra_Latn).
+    eval_profile: french
+    goals:
+      task: french_bench_xnli
+      min_improve: 0.01
+      guard_tasks:
+        - task: hellaswag
+          max_regress: 0.03
+    publish:
+      hub_repo: MSGEncrypted/minicpm5-1b-french-lora
+      mirror_repos:
+        - build-small-hackathon/minicpm5-1b-french-lora
+      private: false