MSG msgencrypted-auto committed on
Commit
bbff1ca
·
1 Parent(s): 28543d3

Feat/last hour (#24)

Browse files

* eval experiment and profiles

* experiments

* french language

* french language

* evals

* evals

---------

Co-authored-by: msgencrypted-auto <msgencrypted.auto@gmail.com>

research/data/build_language_lesson_chat.py CHANGED
@@ -43,9 +43,11 @@ MAX_ASSISTANT_CHARS = 600
43
  EVAL_HOLDOUT_RATIO = 0.05
44
 
45
  DEFAULT_FR_SOURCES = (
 
46
  "angeluriot/french_instruct",
47
  "CohereLabs/aya_dataset",
48
  "pinzhenchen/alpaca-cleaned-fr",
 
49
  )
50
  DEFAULT_AR_SOURCES = (
51
  "arbml/CIDAR",
@@ -54,9 +56,11 @@ DEFAULT_AR_SOURCES = (
54
  )
55
 
56
  SOURCE_CAPS: dict[str, dict[str, int]] = {
 
57
  "angeluriot/french_instruct": {"fr": 8000},
58
  "CohereLabs/aya_dataset": {"fr": 3000, "ar": 3000},
59
  "pinzhenchen/alpaca-cleaned-fr": {"fr": 2000},
 
60
  "arbml/CIDAR": {"ar": 8000},
61
  "ClusterlabAi/InstAr-500k": {"ar": 5000},
62
  }
@@ -138,6 +142,23 @@ def _load_seeds(path: Path) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]
138
  return fr_rows, ar_rows
139
 
140
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  def _iter_french_instruct(max_rows: int) -> Iterator[tuple[str, str, str | None]]:
142
  from datasets import load_dataset
143
 
@@ -197,6 +218,25 @@ def _iter_alpaca_fr(max_rows: int) -> Iterator[tuple[str, str, str | None]]:
197
  break
198
 
199
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  def _iter_cidar(max_rows: int) -> Iterator[tuple[str, str, str | None]]:
201
  from datasets import load_dataset
202
 
@@ -235,12 +275,14 @@ def _iter_instar(max_rows: int) -> Iterator[tuple[str, str, str | None]]:
235
 
236
 
237
  _SOURCE_LOADERS: dict[str, dict[str, Any]] = {
 
238
  "angeluriot/french_instruct": {"fr": _iter_french_instruct},
239
  "CohereLabs/aya_dataset": {
240
  "fr": lambda n: _iter_aya("fra", n),
241
  "ar": lambda n: _iter_aya("arb", n),
242
  },
243
  "pinzhenchen/alpaca-cleaned-fr": {"fr": _iter_alpaca_fr},
 
244
  "arbml/CIDAR": {"ar": _iter_cidar},
245
  "ClusterlabAi/InstAr-500k": {"ar": _iter_instar},
246
  }
 
43
  EVAL_HOLDOUT_RATIO = 0.05
44
 
45
  DEFAULT_FR_SOURCES = (
46
+ "FrancophonIA/english_french",
47
  "angeluriot/french_instruct",
48
  "CohereLabs/aya_dataset",
49
  "pinzhenchen/alpaca-cleaned-fr",
50
+ "jpacifico/French-Alpaca-dataset-Instruct-110K",
51
  )
52
  DEFAULT_AR_SOURCES = (
53
  "arbml/CIDAR",
 
56
  )
57
 
58
  SOURCE_CAPS: dict[str, dict[str, int]] = {
59
+ "FrancophonIA/english_french": {"fr": 4000},
60
  "angeluriot/french_instruct": {"fr": 8000},
61
  "CohereLabs/aya_dataset": {"fr": 3000, "ar": 3000},
62
  "pinzhenchen/alpaca-cleaned-fr": {"fr": 2000},
63
+ "jpacifico/French-Alpaca-dataset-Instruct-110K": {"fr": 4000},
64
  "arbml/CIDAR": {"ar": 8000},
65
  "ClusterlabAi/InstAr-500k": {"ar": 5000},
66
  }
 
142
  return fr_rows, ar_rows
143
 
144
 
145
+ def _iter_english_french(max_rows: int) -> Iterator[tuple[str, str, str | None]]:
146
+ """EN→FR parallel sentences — user asks in English, coach replies in French."""
147
+ from datasets import load_dataset
148
+
149
+ ds = load_dataset("FrancophonIA/english_french", split="train", streaming=True)
150
+ count = 0
151
+ for row in ds:
152
+ english = (row.get("english") or "").strip()
153
+ french = (row.get("french") or "").strip()
154
+ if english and _assistant_ok(french):
155
+ user = f"Translate the following to French:\n{english}"
156
+ yield user, french, None
157
+ count += 1
158
+ if count >= max_rows:
159
+ break
160
+
161
+
162
  def _iter_french_instruct(max_rows: int) -> Iterator[tuple[str, str, str | None]]:
163
  from datasets import load_dataset
164
 
 
218
  break
219
 
220
 
221
+ def _iter_french_alpaca_110k(max_rows: int) -> Iterator[tuple[str, str, str | None]]:
222
+ from datasets import load_dataset
223
+
224
+ ds = load_dataset(
225
+ "jpacifico/French-Alpaca-dataset-Instruct-110K", split="train", streaming=True
226
+ )
227
+ count = 0
228
+ for row in ds:
229
+ instruction = (row.get("instruction") or "").strip()
230
+ inp = (row.get("input") or "").strip()
231
+ output = (row.get("output") or "").strip()
232
+ user_text = f"{instruction}\n{inp}".strip() if inp else instruction
233
+ if user_text and _assistant_ok(output):
234
+ yield user_text, output, None
235
+ count += 1
236
+ if count >= max_rows:
237
+ break
238
+
239
+
240
  def _iter_cidar(max_rows: int) -> Iterator[tuple[str, str, str | None]]:
241
  from datasets import load_dataset
242
 
 
275
 
276
 
277
  _SOURCE_LOADERS: dict[str, dict[str, Any]] = {
278
+ "FrancophonIA/english_french": {"fr": _iter_english_french},
279
  "angeluriot/french_instruct": {"fr": _iter_french_instruct},
280
  "CohereLabs/aya_dataset": {
281
  "fr": lambda n: _iter_aya("fra", n),
282
  "ar": lambda n: _iter_aya("arb", n),
283
  },
284
  "pinzhenchen/alpaca-cleaned-fr": {"fr": _iter_alpaca_fr},
285
+ "jpacifico/French-Alpaca-dataset-Instruct-110K": {"fr": _iter_french_alpaca_110k},
286
  "arbml/CIDAR": {"ar": _iter_cidar},
287
  "ClusterlabAi/InstAr-500k": {"ar": _iter_instar},
288
  }
research/evals/configs/eval_profiles.yaml CHANGED
@@ -72,6 +72,59 @@ profiles:
72
  tasks:
73
  - ifeval
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  general_slm:
76
  tool: slm-lm-eval
77
  claim: General ~1B SLM baseline
 
72
  tasks:
73
  - ifeval
74
 
75
+ medical:
76
+ tool: slm-lm-eval
77
+ claim: Better medical knowledge
78
+ description: Clinical Q&A — PubMedQA + MedMCQA + MedQA (USMLE) with arc guard.
79
+ config: lm_eval_medical.yaml
80
+ tasks:
81
+ - pubmedqa
82
+ - medmcqa
83
+ - medqa_4options
84
+ - arc_challenge
85
+
86
+ multilingual:
87
+ tool: slm-lm-eval
88
+ claim: Better multilingual understanding
89
+ description: Cross-lingual NLI / commonsense / coreference (XNLI, XCOPA, XWinograd).
90
+ config: lm_eval_multilingual.yaml
91
+ tasks:
92
+ - xnli
93
+ - xcopa
94
+ - xwinograd
95
+
96
+ commonsense:
97
+ tool: slm-lm-eval
98
+ claim: Better commonsense reasoning
99
+ description: Everyday-knowledge MCQ + coreference + physical commonsense.
100
+ config: lm_eval_commonsense.yaml
101
+ tasks:
102
+ - commonsense_qa
103
+ - winogrande
104
+ - piqa
105
+ - hellaswag
106
+
107
+ safety:
108
+ tool: slm-lm-eval
109
+ claim: More truthful, fewer imitative falsehoods
110
+ description: TruthfulQA MC2/MC1 (eval-only; do not train on the test set).
111
+ config: lm_eval_safety.yaml
112
+ tasks:
113
+ - truthfulqa_mc2
114
+ - truthfulqa_mc1
115
+ - arc_easy
116
+
117
+ french:
118
+ tool: slm-lm-eval
119
+ claim: Better French understanding and translation
120
+ description: Official FrenchBench MC tasks + WMT14 EN→FR (CroissantLLM benchmark suite).
121
+ config: lm_eval_french.yaml
122
+ tasks:
123
+ - french_bench_xnli
124
+ - belebele_fra_Latn
125
+ - french_bench_boolqa
126
+ - wmt14-en-fr
127
+
128
  general_slm:
129
  tool: slm-lm-eval
130
  claim: General ~1B SLM baseline
research/evals/configs/lm_eval_commonsense.yaml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Commonsense profile — everyday reasoning, coreference, physical commonsense
2
+ # Run: slm-lm-eval --profile commonsense --preset minicpm5-1b --experiment-name commonsense-baseline
3
+
4
+ profile: commonsense
5
+ claim: Better commonsense reasoning
6
+
7
+ tasks:
8
+ - commonsense_qa # 5-way everyday-knowledge MCQ (gate task)
9
+ - winogrande # pronoun-resolution commonsense
10
+ - piqa # physical commonsense (general-capability guard)
11
+ - hellaswag # grounded commonsense guard
12
+
13
+ num_fewshot: 0
14
+ limit: 200
15
+ seed: 42
16
+ batch_size: auto
17
+ device: auto
18
+ dtype: bfloat16
19
+ trust_remote_code: true
20
+ output_dir: results/lm_eval
research/evals/configs/lm_eval_french.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # French profile — official FrenchBench (CroissantLLM) + EN→FR translation
2
+ # Pairs with french-lora (FrancophonIA/english_french). Run:
3
+ # slm-lm-eval --profile french --preset minicpm5-1b --experiment-name french-baseline
4
+
5
+ profile: french
6
+ claim: Better French understanding and translation
7
+
8
+ tasks:
9
+ - french_bench_xnli # French NLI (multiple choice; FrenchBench official)
10
+ - belebele_fra_Latn # French reading comprehension (FLORES-200 based)
11
+ - french_bench_boolqa # French boolean QA
12
+ - wmt14-en-fr # WMT14 English→French translation (BLEU)
13
+
14
+ num_fewshot: 0
15
+ limit: 100
16
+ seed: 42
17
+ batch_size: auto
18
+ device: auto
19
+ dtype: bfloat16
20
+ trust_remote_code: true
21
+ output_dir: results/lm_eval
research/evals/configs/lm_eval_medical.yaml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Medical profile — clinical Q&A fact recall + reasoning
2
+ # Run: slm-lm-eval --profile medical --preset minicpm5-1b --experiment-name medical-baseline
3
+
4
+ profile: medical
5
+ claim: Better medical knowledge
6
+
7
+ tasks:
8
+ - pubmedqa # yes/no/maybe over biomedical abstracts (gate task)
9
+ - medmcqa # multi-subject medical entrance-exam MCQ
10
+ - medqa_4options # USMLE-style 4-option clinical MCQ
11
+ - arc_challenge # general-capability guard (catch regression from skill tuning)
12
+
13
+ num_fewshot: null # per-task canonical fewshot
14
+ limit: 200 # larger sample -> tighter stderr for gate decisions
15
+ seed: 42
16
+ batch_size: auto
17
+ device: auto
18
+ dtype: bfloat16
19
+ trust_remote_code: true
20
+ output_dir: results/lm_eval
research/evals/configs/lm_eval_multilingual.yaml ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Multilingual profile — cross-lingual NLI / commonsense / coreference
2
+ # Pairs with the FR/AR language-lesson adapter. Run:
3
+ # slm-lm-eval --profile multilingual --preset minicpm5-1b --experiment-name multilingual-baseline
4
+
5
+ profile: multilingual
6
+ claim: Better multilingual understanding
7
+
8
+ tasks:
9
+ - xnli # cross-lingual natural-language inference (15 langs incl. fr/ar)
10
+ - xcopa # cross-lingual causal commonsense
11
+ - xwinograd # cross-lingual coreference (Winograd schema)
12
+
13
+ num_fewshot: 0
14
+ limit: 100
15
+ seed: 42
16
+ batch_size: auto
17
+ device: auto
18
+ dtype: bfloat16
19
+ trust_remote_code: true
20
+ output_dir: results/lm_eval
research/evals/configs/lm_eval_safety.yaml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Safety / truthfulness profile — resist imitative falsehoods
2
+ # EVAL-ONLY: do NOT fine-tune on TruthfulQA (it is the test set — contamination).
3
+ # Improve it indirectly via high-quality helpful/honest instruction data, then
4
+ # measure here. Run:
5
+ # slm-lm-eval --profile safety --preset minicpm5-1b --experiment-name safety-baseline
6
+
7
+ profile: safety
8
+ claim: More truthful, fewer imitative falsehoods
9
+
10
+ tasks:
11
+ - truthfulqa_mc2 # multi-true MC truthfulness (primary)
12
+ - truthfulqa_mc1 # single-true MC truthfulness
13
+ - arc_easy # general-capability guard
14
+
15
+ num_fewshot: 0
16
+ limit: 200
17
+ seed: 42
18
+ batch_size: auto
19
+ device: auto
20
+ dtype: bfloat16
21
+ trust_remote_code: true
22
+ output_dir: results/lm_eval
research/evals/docs/eval_profiles.md CHANGED
@@ -51,6 +51,8 @@ Use **one profile per claim**. Do not compare training loss to lm-eval accuracy.
51
  | Better language understanding | `understanding` | `slm-lm-eval` | `boolq`, `piqa`, `copa`, `rte` |
52
  | Better code generation | `code` | `slm-lm-eval` | `humaneval`, `mbpp` |
53
  | Better instruction following | `instructions` | `slm-lm-eval` | `ifeval` |
 
 
54
  | General ~1B SLM baseline | `general_slm` | `slm-lm-eval` | 6-task mix (full splits) |
55
  | Baseline vs finetune study | `compare_study` | `slm-lm-eval` | Same 6 tasks, limit 100 |
56
  | Tool use / function calling | `agentic_tool_use` | `slm-benchmark` | `bfcl`, `tau_bench` |
@@ -72,6 +74,8 @@ Use **one profile per claim**. Do not compare training loss to lm-eval accuracy.
72
  | `understanding` | `lm_eval_understanding.yaml` |
73
  | `code` | `lm_eval_code.yaml` |
74
  | `instructions` | `lm_eval_instructions.yaml` |
 
 
75
  | `general_slm` | `lm_eval_minicpm5.yaml` |
76
  | `compare_study` | `lm_eval_compare_study.yaml` |
77
 
 
51
  | Better language understanding | `understanding` | `slm-lm-eval` | `boolq`, `piqa`, `copa`, `rte` |
52
  | Better code generation | `code` | `slm-lm-eval` | `humaneval`, `mbpp` |
53
  | Better instruction following | `instructions` | `slm-lm-eval` | `ifeval` |
54
+ | Better French / translation | `french` | `slm-lm-eval` | `french_bench_xnli`, `belebele_fra_Latn`, `wmt14-en-fr`, … |
55
+ | Better multilingual understanding | `multilingual` | `slm-lm-eval` | `xnli`, `xcopa`, `xwinograd` |
56
  | General ~1B SLM baseline | `general_slm` | `slm-lm-eval` | 6-task mix (full splits) |
57
  | Baseline vs finetune study | `compare_study` | `slm-lm-eval` | Same 6 tasks, limit 100 |
58
  | Tool use / function calling | `agentic_tool_use` | `slm-benchmark` | `bfcl`, `tau_bench` |
 
74
  | `understanding` | `lm_eval_understanding.yaml` |
75
  | `code` | `lm_eval_code.yaml` |
76
  | `instructions` | `lm_eval_instructions.yaml` |
77
+ | `french` | `lm_eval_french.yaml` |
78
+ | `multilingual` | `lm_eval_multilingual.yaml` |
79
  | `general_slm` | `lm_eval_minicpm5.yaml` |
80
  | `compare_study` | `lm_eval_compare_study.yaml` |
81
 
research/finetune.py CHANGED
@@ -471,12 +471,13 @@ def save_training_results(
471
  return path
472
 
473
 
474
- def to_prompt_response(example, fmt, tokenizer, keys=None):
475
  """Normalize any supported format into a single training string,
476
  returning (full_text, prompt_text). prompt_text is None for raw text.
477
 
478
  `keys` optionally remaps a dataset's column names onto the format's
479
- expected fields (e.g. {"prompt": "query"} for MetaMathQA)."""
 
480
  keys = keys or {}
481
  if fmt == "text":
482
  return example[keys.get("text", "text")], None
@@ -491,6 +492,8 @@ def to_prompt_response(example, fmt, tokenizer, keys=None):
491
 
492
  elif fmt == "prompt":
493
  prompt = example.get(keys.get("prompt", "prompt"), "")
 
 
494
  rkey = keys.get("response")
495
  resp = example.get(rkey, "") if rkey else example.get(
496
  "completion", example.get("response", ""))
@@ -517,9 +520,10 @@ def to_prompt_response(example, fmt, tokenizer, keys=None):
517
  return full, prompt_only
518
 
519
 
520
- def build_tokenize_fn(tokenizer, fmt, max_len, mask_prompt, keys=None):
521
  def fn(example):
522
- full, prompt = to_prompt_response(example, fmt, tokenizer, keys)
 
523
  ids = tokenizer(full, truncation=True, max_length=max_len,
524
  add_special_tokens=(fmt == "text"))["input_ids"]
525
  labels = list(ids)
@@ -593,7 +597,9 @@ def build_training_dataset(args, tokenizer):
593
  raw = raw.shuffle(seed=args.seed)
594
  keys = spec.get("columns") or {}
595
  max_len = spec.get("max_len", args.max_len)
596
- tokenize = build_tokenize_fn(tokenizer, fmt, max_len, args.mask_prompt, keys)
 
 
597
  tok = raw.map(tokenize, remove_columns=raw.column_names,
598
  desc=f"tokenizing {dataset}")
599
  tok = tok.filter(lambda e: len(e["input_ids"]) > 1)
 
471
  return path
472
 
473
 
474
+ def to_prompt_response(example, fmt, tokenizer, keys=None, prompt_prefix=None):
475
  """Normalize any supported format into a single training string,
476
  returning (full_text, prompt_text). prompt_text is None for raw text.
477
 
478
  `keys` optionally remaps a dataset's column names onto the format's
479
+ expected fields (e.g. {"prompt": "query"} for MetaMathQA).
480
+ `prompt_prefix` prepends fixed instruction text to prompt-format user turns."""
481
  keys = keys or {}
482
  if fmt == "text":
483
  return example[keys.get("text", "text")], None
 
492
 
493
  elif fmt == "prompt":
494
  prompt = example.get(keys.get("prompt", "prompt"), "")
495
+ if prompt_prefix:
496
+ prompt = f"{prompt_prefix}{prompt}"
497
  rkey = keys.get("response")
498
  resp = example.get(rkey, "") if rkey else example.get(
499
  "completion", example.get("response", ""))
 
520
  return full, prompt_only
521
 
522
 
523
+ def build_tokenize_fn(tokenizer, fmt, max_len, mask_prompt, keys=None, prompt_prefix=None):
524
  def fn(example):
525
+ full, prompt = to_prompt_response(
526
+ example, fmt, tokenizer, keys, prompt_prefix=prompt_prefix)
527
  ids = tokenizer(full, truncation=True, max_length=max_len,
528
  add_special_tokens=(fmt == "text"))["input_ids"]
529
  labels = list(ids)
 
597
  raw = raw.shuffle(seed=args.seed)
598
  keys = spec.get("columns") or {}
599
  max_len = spec.get("max_len", args.max_len)
600
+ prefix = spec.get("prompt_prefix")
601
+ tokenize = build_tokenize_fn(
602
+ tokenizer, fmt, max_len, args.mask_prompt, keys, prompt_prefix=prefix)
603
  tok = raw.map(tokenize, remove_columns=raw.column_names,
604
  desc=f"tokenizing {dataset}")
605
  tok = tok.filter(lambda e: len(e["input_ids"]) > 1)
research/modal/README.md CHANGED
@@ -101,6 +101,8 @@ QLoRA adapter per category, each evaluated against the matching
101
  | `math-lora` | math | `TIGER-Lab/MathInstruct` (`alpaca`) | `math` | `gsm8k` (+ `arc_challenge` guard) | ✅ |
102
  | `coding-lora` | coding | `iamtarun/python_code_instructions_18k_alpaca` (`alpaca`) | `code` | `mbpp` | ✅ |
103
  | `reasoning-lora` | reasoning | `HuggingFaceTB/smoltalk` (`chat`) | `reasoning` | `gsm8k` (+ `hellaswag` guard) | ✅ |
 
 
104
  | `alpaca-lora` | instructions | `tatsu-lab/alpaca` (`alpaca`) | `instructions` | — (no `goals`) | local-only |
105
 
106
  Before publishing, replace `defaults.hub_org` and each job's `publish.hub_repo`
 
101
  | `math-lora` | math | `TIGER-Lab/MathInstruct` (`alpaca`) | `math` | `gsm8k` (+ `arc_challenge` guard) | ✅ |
102
  | `coding-lora` | coding | `iamtarun/python_code_instructions_18k_alpaca` (`alpaca`) | `code` | `mbpp` | ✅ |
103
  | `reasoning-lora` | reasoning | `HuggingFaceTB/smoltalk` (`chat`) | `reasoning` | `gsm8k` (+ `hellaswag` guard) | ✅ |
104
+ | `language-lesson-lora` | language | `language-lesson-fr/ar.jsonl` (`chat`) | `multilingual` | `xnli` (+ `hellaswag` guard) | ✅ |
105
+ | `french-lora` | french | `FrancophonIA/english_french` (`prompt`) + FR chat | `french` | `french_bench_xnli` (+ `hellaswag` guard) | ✅ |
106
  | `alpaca-lora` | instructions | `tatsu-lab/alpaca` (`alpaca`) | `instructions` | — (no `goals`) | local-only |
107
 
108
  Before publishing, replace `defaults.hub_org` and each job's `publish.hub_repo`
research/modal/experiments.yaml CHANGED
@@ -81,28 +81,63 @@ finetune:
81
  - build-small-hackathon/minicpm5-1b-teaching-lora
82
  private: false
83
 
84
- # --- science: factual + explanatory science tutoring ---
85
- # 10 local science-tutor chats overfit easily on their own; mix in alpaca
86
- # replay + NEFTune + bigger LoRA rank (same recipe as teaching/math/language)
87
- # so sciq recall improves instead of regressing from narrow chat-format drift.
 
 
 
 
 
88
  - name: science-lora
89
  category: science
90
- max_steps: 150
91
  mix:
92
- - dataset: research/data/science-tutor-chat.jsonl
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  format: chat
94
- weight: 16 # ~10 samples -> ~160 examples
95
- - dataset: tatsu-lab/alpaca # general replay: protect sciq/arc_challenge
 
 
 
 
 
 
 
96
  format: alpaca
97
- dataset_split: "train[:600]"
98
- max_samples: 600
99
  args:
100
- lora_r: 32
101
- lora_alpha: 64
102
- neftune_noise_alpha: 5
103
- early_stopping_patience: 2 # keep best eval_loss checkpoint, not the last
104
  val_split: 0.05
105
- description: Science tutor Q&A chat + alpaca replay, r=32 + NEFTune
 
 
106
  eval_profile: science
107
  goals:
108
  task: sciq
@@ -214,6 +249,127 @@ finetune:
214
  - build-small-hackathon/minicpm5-1b-reasoning-lora
215
  private: false
216
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  # --- general instructions baseline: no goals/publish -> local-only adapter ---
218
  - name: alpaca-lora
219
  category: instructions
@@ -252,9 +408,9 @@ finetune:
252
  description: >
253
  FR/AR TeacherVoice LoRA from language-lesson-fr/ar.jsonl (Hub-built via
254
  build_language_lesson_chat.py) + English replay
255
- eval_profile: understanding
256
  goals:
257
- task: boolq
258
  min_improve: 0.0
259
  guard_tasks:
260
  - task: hellaswag
@@ -264,3 +420,47 @@ finetune:
264
  mirror_repos:
265
  - build-small-hackathon/minicpm5-1b-language-lesson-lora
266
  private: false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  - build-small-hackathon/minicpm5-1b-teaching-lora
82
  private: false
83
 
84
+ # --- science: MC-format science Q&A (sciq/ARC/OpenBookQA train) ---
85
+ # Previous attempt used chat-format tutoring — the wrong signal for MC benchmarks.
86
+ # Model already scores 0.935 on sciq; it needs in-distribution MC Q→A to push higher.
87
+ # allenai/sciq train: 11k factual science MC (question→correct_answer).
88
+ # allenai/ai2_arc ARC-Easy train: elementary/science MC, boosts arc_* guards.
89
+ # allenai/openbookqa train: fact-based science Q&A, improves openbookqa eval.
90
+ # Local science-tutor-chat kept at low weight for style/explanation diversity.
91
+ # MetaMathQA slice protects gsm8k guard (prevented 0.14 regression last run).
92
+ # Reduced to r=16, no NEFTune: less catastrophic forgetting on small datasets.
93
  - name: science-lora
94
  category: science
95
+ max_steps: 120
96
  mix:
97
+ - dataset: allenai/sciq # 11k MC science Q→A (in-distribution with sciq eval)
98
+ format: prompt
99
+ columns:
100
+ prompt: question
101
+ response: correct_answer
102
+ dataset_split: "train[:1500]"
103
+ max_samples: 1500
104
+ - dataset: allenai/ai2_arc # ARC-Easy config only: elementary science MC
105
+ format: prompt
106
+ dataset_config: ARC-Easy
107
+ columns:
108
+ prompt: question
109
+ response: answerKey
110
+ dataset_split: "train[:500]"
111
+ max_samples: 500
112
+ - dataset: allenai/openbookqa # fact-based open science Q&A
113
+ format: prompt
114
+ columns:
115
+ prompt: question_stem
116
+ response: answerKey
117
+ dataset_split: "train[:400]"
118
+ max_samples: 400
119
+ - dataset: research/data/science-tutor-chat.jsonl # style diversity
120
  format: chat
121
+ weight: 4
122
+ - dataset: meta-math/MetaMathQA # gsm8k guard protection
123
+ format: prompt
124
+ columns:
125
+ prompt: query
126
+ response: response
127
+ dataset_split: "train[:200]"
128
+ max_samples: 200
129
+ - dataset: tatsu-lab/alpaca # general replay: protect hellaswag/piqa/boolq
130
  format: alpaca
131
+ dataset_split: "train[:400]"
132
+ max_samples: 400
133
  args:
134
+ lora_r: 16
135
+ lora_alpha: 32
136
+ early_stopping_patience: 3
 
137
  val_split: 0.05
138
+ description: >
139
+ sciq + ARC-Easy + OpenBookQA MC train + science-chat style + MetaMathQA
140
+ guard + alpaca replay. r=16, no NEFTune to avoid gsm8k regression.
141
  eval_profile: science
142
  goals:
143
  task: sciq
 
249
  - build-small-hackathon/minicpm5-1b-reasoning-lora
250
  private: false
251
 
252
+ # --- medical: clinical Q&A (MedQA/Meadow) + alpaca replay ---
253
+ # New vertical. Same overfit-guard recipe as teaching/science: a focused
254
+ # skill dataset up-weighted, alpaca replay + NEFTune + r=32 so PubMedQA/MedMCQA
255
+ # improve without regressing the arc_challenge general-knowledge guard.
256
+ - name: medical-lora
257
+ category: medical
258
+ max_steps: 200
259
+ mix:
260
+ - dataset: medalpaca/medical_meadow_medqa # USMLE-style QA, alpaca columns
261
+ format: alpaca
262
+ dataset_split: "train[:2000]"
263
+ max_samples: 2000
264
+ - dataset: tatsu-lab/alpaca # general replay: protect guards
265
+ format: alpaca
266
+ dataset_split: "train[:600]"
267
+ max_samples: 600
268
+ args:
269
+ lora_r: 32
270
+ lora_alpha: 64
271
+ neftune_noise_alpha: 5
272
+ early_stopping_patience: 2
273
+ val_split: 0.05
274
+ description: Medical QA (medalpaca Meadow MedQA) + alpaca replay, r=32 + NEFTune
275
+ eval_profile: medical
276
+ goals:
277
+ task: pubmedqa
278
+ min_score: 0.45
279
+ min_improve: 0.02
280
+ guard_tasks:
281
+ - task: arc_challenge
282
+ max_regress: 0.03
283
+ publish:
284
+ hub_repo: MSGEncrypted/minicpm5-1b-medical-lora
285
+ mirror_repos:
286
+ - build-small-hackathon/minicpm5-1b-medical-lora
287
+ private: false
288
+
289
+ # --- tool-use: function/tool-calling (xLAM) ---
290
+ # New vertical that closes the loop with the existing BFCL agentic benchmark.
291
+ # The publish gate guards general ability (lm-eval has no function-call task);
292
+ # the *skill* metric is the BFCL/tau-bench suite run via slm-benchmark:
293
+ # uv run --package slm-evals slm-benchmark --model <adapter> --benchmarks bfcl --max-samples 50
294
+ - name: tool-use-lora
295
+ category: tool_use
296
+ max_steps: 200
297
+ mix:
298
+ - dataset: Salesforce/xlam-function-calling-60k
299
+ format: prompt
300
+ columns:
301
+ prompt: query
302
+ response: answers # JSON function-call(s) the model must emit
303
+ dataset_split: "train[:3000]"
304
+ max_samples: 3000
305
+ - dataset: tatsu-lab/alpaca # general replay: protect guards
306
+ format: alpaca
307
+ dataset_split: "train[:600]"
308
+ max_samples: 600
309
+ args:
310
+ lora_r: 32
311
+ lora_alpha: 64
312
+ neftune_noise_alpha: 5
313
+ early_stopping_patience: 2
314
+ val_split: 0.05
315
+ description: >
316
+ Function/tool-calling (Salesforce xLAM) + alpaca replay. Skill metric is the
317
+ BFCL agentic suite (slm-benchmark); lm-eval gate only guards general ability.
318
+ eval_profile: compare_study
319
+ goals:
320
+ task: arc_easy
321
+ min_improve: 0.0
322
+ guard_tasks:
323
+ - task: hellaswag
324
+ max_regress: 0.03
325
+ - task: piqa
326
+ max_regress: 0.03
327
+ publish:
328
+ hub_repo: MSGEncrypted/minicpm5-1b-tool-use-lora
329
+ mirror_repos:
330
+ - build-small-hackathon/minicpm5-1b-tool-use-lora
331
+ private: false
332
+
333
+ # --- commonsense: everyday-reasoning MCQ (CommonsenseQA train) ---
334
+ # New vertical. In-distribution MC train (question -> answerKey), same recipe
335
+ # as science-lora's ARC/OpenBookQA slices, with alpaca replay + piqa/hellaswag guards.
336
+ - name: commonsense-lora
337
+ category: commonsense
338
+ max_steps: 150
339
+ mix:
340
+ - dataset: tau/commonsense_qa # 5-way everyday-knowledge MCQ, in-distribution
341
+ format: prompt
342
+ columns:
343
+ prompt: question
344
+ response: answerKey
345
+ dataset_split: "train[:2000]"
346
+ max_samples: 2000
347
+ - dataset: tatsu-lab/alpaca # general replay: protect piqa/hellaswag guards
348
+ format: alpaca
349
+ dataset_split: "train[:600]"
350
+ max_samples: 600
351
+ args:
352
+ lora_r: 16
353
+ lora_alpha: 32
354
+ early_stopping_patience: 2
355
+ val_split: 0.05
356
+ description: CommonsenseQA MC train + alpaca replay, r=16
357
+ eval_profile: commonsense
358
+ goals:
359
+ task: commonsense_qa
360
+ min_score: 0.30
361
+ min_improve: 0.02
362
+ guard_tasks:
363
+ - task: piqa
364
+ max_regress: 0.03
365
+ - task: hellaswag
366
+ max_regress: 0.03
367
+ publish:
368
+ hub_repo: MSGEncrypted/minicpm5-1b-commonsense-lora
369
+ mirror_repos:
370
+ - build-small-hackathon/minicpm5-1b-commonsense-lora
371
+ private: false
372
+
373
  # --- general instructions baseline: no goals/publish -> local-only adapter ---
374
  - name: alpaca-lora
375
  category: instructions
 
408
  description: >
409
  FR/AR TeacherVoice LoRA from language-lesson-fr/ar.jsonl (Hub-built via
410
  build_language_lesson_chat.py) + English replay
411
+ eval_profile: multilingual
412
  goals:
413
+ task: xnli
414
  min_improve: 0.0
415
  guard_tasks:
416
  - task: hellaswag
 
420
  mirror_repos:
421
  - build-small-hackathon/minicpm5-1b-language-lesson-lora
422
  private: false
423
+
424
+ # --- french: EN→FR translation (FrancophonIA/english_french) + FrenchBench gate ---
425
+ # 320k parallel sentences from Kaggle englishfrench-fornmt (Hub: FrancophonIA/english_french).
426
+ # FrenchBench (CroissantLLM) is the official French lm-eval suite; gate on french_bench_xnli.
427
+ - name: french-lora
428
+ category: french
429
+ max_steps: 150
430
+ mix:
431
+ - dataset: FrancophonIA/english_french
432
+ format: prompt
433
+ columns:
434
+ prompt: english
435
+ response: french
436
+ prompt_prefix: "Translate the following English sentence to French:\n"
437
+ dataset_split: "train[:3000]"
438
+ max_samples: 3000
439
+ - dataset: research/data/language-lesson-fr.jsonl
440
+ format: chat
441
+ weight: 6
442
+ - dataset: tatsu-lab/alpaca
443
+ format: alpaca
444
+ dataset_split: "train[:400]"
445
+ max_samples: 400
446
+ args:
447
+ lora_r: 32
448
+ lora_alpha: 64
449
+ neftune_noise_alpha: 5
450
+ early_stopping_patience: 2
451
+ val_split: 0.05
452
+ description: >
453
+ EN→FR translation (FrancophonIA/english_french) + TeacherVoice FR chat +
454
+ alpaca replay. Evaluated on FrenchBench (french_bench_xnli, belebele_fra_Latn).
455
+ eval_profile: french
456
+ goals:
457
+ task: french_bench_xnli
458
+ min_improve: 0.01
459
+ guard_tasks:
460
+ - task: hellaswag
461
+ max_regress: 0.03
462
+ publish:
463
+ hub_repo: MSGEncrypted/minicpm5-1b-french-lora
464
+ mirror_repos:
465
+ - build-small-hackathon/minicpm5-1b-french-lora
466
+ private: false