Buckets:
bbkdevops/unicosys-hypergraph-bucket / tinymind-native-colab-handoff /bundle /scripts /deep_core_probe.py
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import logging | |
| import re | |
| import sys | |
| import time | |
| import warnings | |
| from pathlib import Path | |
# Quiet two known-noisy third-party diagnostics. These statements are placed
# ahead of the torch / peft / transformers imports so the logger level and the
# warning filter are already installed when those packages are loaded.
logging.getLogger("torch.utils.flop_counter").setLevel(logging.ERROR)
warnings.filterwarnings(
    action="ignore",
    category=FutureWarning,
    module=r"bitsandbytes\.backends\.cuda\.ops",
    message=r"_check_is_size will be removed in a future PyTorch release.*",
)
| import torch | |
| from peft import PeftModel | |
| from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig | |
import os

# Root of the TinyMind checkout. Defaults to the original Windows workstation
# path; set the TINYMIND_ROOT environment variable to point somewhere else
# (for example on a Linux runtime, where a "D:\..." path cannot resolve).
PROJECT_ROOT = Path(os.environ.get("TINYMIND_ROOT") or r"D:\ad\tinymind")
MODEL_ROOT = PROJECT_ROOT / "model" / "tinymind-12b"
# MODEL_ROOT goes first on sys.path so the tinymind_text_sanitize import that
# follows is resolved from the model directory.
sys.path.insert(0, str(MODEL_ROOT))
| from tinymind_text_sanitize import sanitize_generated_text | |
# Fixed probe set run against every adapter. Each entry carries:
#   id     - short label used in the console output and the JSON report
#   prompt - user message sent to the model
#   must   - terms that score() looks for in the response as case-insensitive
#            substrings; absent terms are reported in a "missing:" flag
PROBES = [
    dict(
        id="language_semantics",
        prompt="อธิบายความต่างระหว่าง ambiguity, vagueness และ uncertainty เป็นภาษาไทย พร้อมตัวอย่างสั้น ๆ",
        must=["ambiguity", "vagueness", "uncertainty"],
    ),
    dict(
        id="raw_code_bits",
        prompt="Explain how to sign-extend a packed signed 6-bit integer and name two boundary values.",
        must=["mask", "sign", "-32", "31"],
    ),
    dict(
        id="systems_abi",
        prompt="Explain ABI compatibility for a Rust/C FFI boundary in one concise paragraph.",
        must=["calling", "layout", "symbol"],
    ),
    dict(
        id="pure_math_bound",
        prompt="พิสูจน์สั้น ๆ ว่า m_t = c m_{t-1} + x_t มีขอบเขตเมื่อ 0<c<1 และ |x_t|<=B",
        must=["B", "1-c", "ขอบเขต"],
    ),
    dict(
        id="entropy_relation",
        prompt="Explain the relation H(P,Q)=H(P)+KL(P||Q) and why it matters for eval loss.",
        must=["cross", "KL", "entropy"],
    ),
]
def load_model(model_id: str, adapter: str):
    """Load the tokenizer and the 4-bit quantized base model with a PEFT adapter attached.

    Args:
        model_id: Identifier handed to ``AutoModelForCausalLM.from_pretrained``
            for the base model. Also used as the tokenizer source when
            ``adapter`` is not an existing filesystem path.
        adapter: Adapter location handed to ``PeftModel.from_pretrained``. When
            that path exists on disk, the tokenizer is loaded from it too.

    Returns:
        A ``(tokenizer, model)`` tuple; the model has been put in eval mode.
    """
    # bfloat16 when CUDA is available, float32 otherwise. The same choice is
    # used for the 4-bit compute dtype and for the model dtype.
    compute_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

    tokenizer_source = adapter if Path(adapter).exists() else model_id
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_source, trust_remote_code=True)
    # Fall back to EOS as the padding token when the tokenizer defines none.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=True,
    )
    base_model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quant_config,
        device_map="auto",
        trust_remote_code=True,
        dtype=compute_dtype,
    )

    peft_model = PeftModel.from_pretrained(base_model, adapter)
    peft_model.eval()
    return tokenizer, peft_model
def repeated_ngrams(text: str) -> bool:
    """Report whether any run of five consecutive words occurs more than once in *text*.

    Words are the regex word-character runs of the lower-cased text, so the
    comparison ignores case and punctuation. A text with fewer than five words
    has no 5-word window and yields ``False``.
    """
    tokens = re.findall(r"\w+", text.lower(), flags=re.UNICODE)
    seen: set[tuple[str, ...]] = set()
    # range() is empty for fewer than five tokens, so no explicit guard is needed.
    for start in range(len(tokens) - 4):
        window = tuple(tokens[start : start + 5])
        if window in seen:
            return True
        seen.add(window)
    return False
def generate(tokenizer, model, prompt: str) -> str:
    """Greedy-decode one answer to *prompt* and return the sanitized completion text.

    The prompt is wrapped in a fixed system + user conversation, rendered with
    the tokenizer's chat template, and decoded without sampling (at most 220
    and at least 24 new tokens, repetition penalty 1.16, no repeated 5-grams).

    Args:
        tokenizer: Tokenizer providing ``apply_chat_template``, ``__call__``,
            ``decode`` and ``eos_token_id``.
        model: Model providing ``generate`` and ``device``.
        prompt: User message to answer.

    Returns:
        The newly generated text only (prompt tokens removed, special tokens
        skipped, whitespace stripped), passed through
        ``sanitize_generated_text``.
    """
    messages = [
        {"role": "system", "content": "Answer precisely. Use constraints from the user. Avoid repetition."},
        {"role": "user", "content": prompt},
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # The rendered template already contains the special tokens it needs, so
    # tokenize without adding them again; the default add_special_tokens=True
    # can duplicate them (for example a second BOS at the start of the prompt).
    inputs = tokenizer(text, return_tensors="pt", add_special_tokens=False).to(model.device)
    prompt_len = inputs["input_ids"].shape[1]
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_new_tokens=220,
            min_new_tokens=24,
            do_sample=False,
            repetition_penalty=1.16,
            no_repeat_ngram_size=5,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    # Keep only the tokens produced after the prompt.
    completion = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True).strip()
    return sanitize_generated_text(completion)
def score(sample: dict, response: str) -> tuple[int, list[str]]:
    """Grade *response* against one probe and list the problems found.

    Args:
        sample: Probe dict; its ``"must"`` entry lists terms that have to
            appear in the response (case-insensitive substring match).
        response: Generated text to grade.

    Returns:
        ``(points, flags)``. Points start at 4, lose one per flag raised and
        one more when any required term is missing, and never go below 0.
        Possible flags, in order: ``"missing:<terms>"``, ``"repetition"``,
        ``"too_short"`` (under 80 characters) and ``"broken_code_fence"``
        (odd number of triple-backtick fences).
    """
    haystack = response.lower()
    absent = [term for term in sample["must"] if term.lower() not in haystack]

    checks = (
        ("missing:" + ",".join(absent), bool(absent)),
        ("repetition", repeated_ngrams(response)),
        ("too_short", len(response) < 80),
        # An odd fence count implies at least one fence is present.
        ("broken_code_fence", response.count("```") % 2 != 0),
    )
    flags = [label for label, raised in checks if raised]

    # Missing terms cost double: once as a flag and once as an extra point.
    penalty = len(flags) + (1 if absent else 0)
    return max(4 - penalty, 0), flags
def main() -> int:
    """Run every probe against each requested adapter and write a JSON report.

    Command-line options:
        --model-id  Base model identifier (default
                    ``mistralai/Mistral-Nemo-Instruct-2407``).
        --adapter   Repeatable ``name=path`` pair; at least one is required.
        --out-dir   Directory that receives ``deep_core_probe_report.json``.

    Returns:
        0 on success. A malformed ``--adapter`` value ends the run through
        ``parser.error`` (usage message, exit status 2).
    """
    import gc  # local import: only needed for the per-adapter cleanup below

    parser = argparse.ArgumentParser()
    parser.add_argument("--model-id", default="mistralai/Mistral-Nemo-Instruct-2407")
    parser.add_argument("--adapter", action="append", required=True, help="name=path")
    parser.add_argument("--out-dir", default=str(PROJECT_ROOT / "reports" / "deep_core_probe_manual"))
    args = parser.parse_args()

    # Validate every --adapter value before loading anything, so a malformed
    # later entry is reported right away instead of surfacing as an unpacking
    # ValueError after earlier adapters have already been loaded and probed.
    adapters: list[tuple[str, str]] = []
    for item in args.adapter:
        name, sep, adapter = item.partition("=")
        if not sep:
            parser.error(f"--adapter expects name=path, got {item!r}")
        adapters.append((name, adapter))

    report = {"created_at": time.time(), "base_model": args.model_id, "probes": PROBES, "adapters": {}}
    for name, adapter in adapters:
        tokenizer, model = load_model(args.model_id, adapter)
        samples = []
        total = 0
        for probe in PROBES:
            response = generate(tokenizer, model, probe["prompt"])
            points, flags = score(probe, response)
            total += points
            samples.append({"id": probe["id"], "response": response, "score": points, "flags": flags})
            print(f"{name} {probe['id']} score={points} flags={flags}")
        report["adapters"][name] = {
            "adapter": adapter,
            "total_score": total,
            "max_score": len(PROBES) * 4,
            "samples": samples,
        }
        # Drop this adapter's model before the next one is loaded. Collecting
        # garbage first lets objects held only by reference cycles be released
        # before the CUDA cache is emptied.
        del model
        gc.collect()
        torch.cuda.empty_cache()

    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / "deep_core_probe_report.json"
    out_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
    print(json.dumps({"report": str(out_path), "adapters": report["adapters"]}, ensure_ascii=False, indent=2))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
Xet Storage Details
- Size:
- 6.11 kB
- Xet hash:
- 67082277eb83925f6b5a677f538b51bc9def47d3777e66e7ff58aef7fb0ffe8e
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.