| |
| """Fine-tune a recruiter-domain NER model from annotated CV data. |
| |
| Expected input formats: |
| - JSONL with tokens + ner_tags |
| - JSON with records containing `tokens` and `ner_tags` |
| |
| This script is intentionally defensive: if `transformers` is unavailable, |
| it still validates and exports a prepared dataset snapshot for later training. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| from pathlib import Path |
| from typing import Any, Dict, List, Sequence, Tuple |
|
|
| try: |
| from datasets import Dataset |
| from transformers import AutoModelForTokenClassification, AutoTokenizer, Trainer, TrainingArguments |
|
|
| TRANSFORMERS_AVAILABLE = True |
| except Exception: |
| Dataset = None |
| AutoModelForTokenClassification = None |
| AutoTokenizer = None |
| Trainer = None |
| TrainingArguments = None |
| TRANSFORMERS_AVAILABLE = False |
|
|
# Reference tag inventory in B-/I- prefixed form (presumably person,
# organisation, skill and education entities -- confirm with the annotation
# guidelines). NOTE(review): nothing in the visible part of this file reads
# this list.
LABEL_NAMES = [
    "O",
    "B-PER",
    "I-PER",
    "B-ORG",
    "I-ORG",
    "B-SKILL",
    "I-SKILL",
    "B-EDU",
    "I-EDU",
]

# Checkpoint used as the default value of the --model-name CLI option.
DEFAULT_MODEL = "dslim/bert-base-NER"
|
|
|
|
def load_annotations(path: Path) -> List[Dict[str, Any]]:
    """Load token/label annotations from a JSON or JSONL file.

    A file whose suffix is ``.jsonl`` (compared case-insensitively) is parsed
    as one JSON document per non-blank line. Any other suffix is parsed as a
    single JSON document, which must be either a list of records or an object
    whose ``"data"`` key holds such a list.

    Args:
        path: Location of the annotation file.

    Returns:
        The parsed records, or an empty list when the file is empty or holds
        only whitespace. Record contents are not validated here.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If the file (or one JSONL line) is not valid
            JSON.
        ValueError: If a JSON document is neither a list nor an object with a
            list under ``"data"``.
    """
    if not path.exists():
        raise FileNotFoundError(f"Annotation file not found: {path}")

    text = path.read_text(encoding="utf-8").strip()
    if not text:
        return []

    if path.suffix.lower() == ".jsonl":
        # Split on "\n" only. str.splitlines() would also break on U+2028,
        # U+2029 and U+0085, which may appear unescaped inside a JSON string
        # and would cut a record in half. read_text() has already normalised
        # "\r\n" and "\r" line endings to "\n".
        return [json.loads(line) for line in text.split("\n") if line.strip()]

    payload = json.loads(text)
    if isinstance(payload, list):
        return payload
    if isinstance(payload, dict) and isinstance(payload.get("data"), list):
        return payload["data"]
    raise ValueError("Unsupported annotation format")
|
|
|
|
def validate_records(records: Sequence[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Keep only records with aligned, non-empty tokens and labels.

    A record is kept when it is a dict whose ``"tokens"`` and ``"ner_tags"``
    values are both lists of the same non-zero length. Everything else is
    dropped silently, including entries that are not dicts at all (for
    example a JSONL line that held a bare list or string).

    Args:
        records: Parsed annotation records.

    Returns:
        A new list of ``{"tokens": [...], "ner_tags": [...]}`` dicts in input
        order, with every token and tag converted to ``str``. Other keys of
        the input records are not carried over.
    """
    valid_records: List[Dict[str, Any]] = []
    for record in records:
        # Guard before .get(): parsed JSON may contain non-object entries.
        if not isinstance(record, dict):
            continue
        tokens = record.get("tokens")
        ner_tags = record.get("ner_tags")
        if not isinstance(tokens, list) or not isinstance(ner_tags, list):
            continue
        if not tokens or len(tokens) != len(ner_tags):
            continue
        valid_records.append(
            {
                "tokens": [str(token) for token in tokens],
                "ner_tags": [str(tag) for tag in ner_tags],
            }
        )
    return valid_records
|
|
|
|
def build_label_map(records: Sequence[Dict[str, Any]]) -> Tuple[Dict[str, int], Dict[int, str]]:
    """Derive the label <-> id lookup tables from the tags found in ``records``.

    ``"O"`` always receives id 0, whether or not it occurs in the data. All
    other tags (converted to ``str``) follow in sorted order.

    Args:
        records: Records whose optional ``"ner_tags"`` entry lists the tags.

    Returns:
        A ``(label2id, id2label)`` pair of mutually inverse dictionaries.
    """
    seen = set()
    for entry in records:
        for tag in entry.get("ner_tags", []):
            seen.add(str(tag))
    # "O" is pinned to the front, so keep it out of the sorted remainder.
    seen.discard("O")
    vocabulary = ["O", *sorted(seen)]

    label2id: Dict[str, int] = {}
    id2label: Dict[int, str] = {}
    for position, name in enumerate(vocabulary):
        label2id[name] = position
        id2label[position] = name
    return label2id, id2label
|
|
|
|
def tokenize_and_align_labels(examples: Dict[str, List[Any]], tokenizer, label2id: Dict[str, int]) -> Dict[str, Any]:
    """Tokenize pre-split words and project word-level tags onto sub-tokens.

    Args:
        examples: Batch with ``"tokens"`` (one list of words per example) and
            ``"ner_tags"`` (one list of string tags per example, aligned with
            the words).
        tokenizer: Callable that accepts the word lists with
            ``is_split_into_words=True`` and returns an object supporting item
            assignment and ``word_ids(batch_index=...)``.
        label2id: Mapping from tag name to class id; must contain ``"O"``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one list
        of label ids per example:

        * positions with no source word (``word_id is None``) get ``-100``;
        * the first sub-token of a word gets the word's tag id, or the ``"O"``
          id if the tag is unknown;
        * later sub-tokens of the same word get the ``I-`` form of a ``B-``
          tag, or the tag itself otherwise. If that ``I-`` form is missing
          from ``label2id`` the position gets ``-100`` rather than ``"O"``,
          so part of an entity is never labelled as outside it.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding=False,
    )

    aligned_labels = []
    for batch_index, labels in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=batch_index)
        previous_word_id = None
        label_ids: List[int] = []
        for word_id in word_ids:
            if word_id is None:
                # -100 is the default ignore_index of PyTorch's cross-entropy
                # loss, so these positions do not contribute to training.
                label_ids.append(-100)
            elif word_id != previous_word_id:
                label_ids.append(label2id.get(labels[word_id], label2id["O"]))
            else:
                label = labels[word_id]
                if label.startswith("B-"):
                    # A continuation piece cannot begin an entity. When the
                    # data never used the matching I- tag, mask the position
                    # instead of mislabelling it as "O".
                    label_ids.append(label2id.get("I-" + label[2:], -100))
                else:
                    label_ids.append(label2id.get(label, label2id["O"]))
            previous_word_id = word_id
        aligned_labels.append(label_ids)

    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs
|
|
|
|
def export_prepared_dataset(records: Sequence[Dict[str, Any]], output_dir: Path) -> Path:
    """Write ``records`` as a pretty-printed JSON snapshot inside ``output_dir``.

    The directory (and any missing parents) is created if needed. The file is
    always named ``recruiter_ner_prepared_dataset.json`` and is written as
    UTF-8 with non-ASCII characters left unescaped.

    Args:
        records: Records to serialise; materialised as a list before dumping.
        output_dir: Directory that receives the snapshot file.

    Returns:
        Path of the written JSON file.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    snapshot = json.dumps(list(records), indent=2, ensure_ascii=False)
    target = output_dir / "recruiter_ner_prepared_dataset.json"
    with target.open("w", encoding="utf-8") as handle:
        handle.write(snapshot)
    return target
|
|
|
|
def train_model(records: Sequence[Dict[str, Any]], model_name: str, output_dir: Path, epochs: int) -> Path:
    """Fine-tune a token-classification model on ``records`` and save it.

    The label set is derived from the records, the base checkpoint is loaded
    with a classification head sized for that label set, and the trained
    model and tokenizer are written to ``output_dir``.

    Args:
        records: Validated records with ``"tokens"`` and string ``"ner_tags"``.
        model_name: Hugging Face model id or local path of the base checkpoint.
        output_dir: Directory for checkpoints and the final model; created if
            missing.
        epochs: Number of training epochs.

    Returns:
        ``output_dir``, after the model and tokenizer have been saved there.

    Raises:
        RuntimeError: If ``transformers``/``datasets`` could not be imported.
    """
    if not TRANSFORMERS_AVAILABLE:
        raise RuntimeError("transformers/datasets are not available in this environment")

    # Imported here because they are only needed once training really runs.
    import inspect

    from transformers import DataCollatorForTokenClassification

    label2id, id2label = build_label_map(records)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForTokenClassification.from_pretrained(
        model_name,
        num_labels=len(label2id),
        label2id=label2id,
        id2label=id2label,
        # The label count comes from the data and can differ from the
        # checkpoint's head; let the head be re-initialised instead of
        # failing with a size-mismatch error.
        ignore_mismatched_sizes=True,
    )

    dataset = Dataset.from_list(list(records))
    tokenized_dataset = dataset.map(
        lambda batch: tokenize_and_align_labels(batch, tokenizer, label2id),
        batched=True,
    )

    output_dir.mkdir(parents=True, exist_ok=True)
    training_args = TrainingArguments(
        output_dir=str(output_dir),
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=epochs,
        weight_decay=0.01,
        logging_steps=10,
        save_strategy="epoch",
        report_to=[],
    )

    # Examples are tokenised without padding, so each batch must be padded at
    # collation time, including the per-token "labels" column. The default
    # collator does not pad "labels".
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

    # Newer transformers releases take the tokenizer as `processing_class`;
    # older ones only know `tokenizer`. Use whichever this version exposes.
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        data_collator=data_collator,
        **{tokenizer_kwarg: tokenizer},
    )
    trainer.train()
    trainer.save_model(str(output_dir))
    tokenizer.save_pretrained(str(output_dir))
    return output_dir
|
|
|
|
def main() -> int:
    """Command-line entry point: validate, export, and optionally train.

    Reads the annotation file, keeps the valid records, and always writes the
    prepared-dataset snapshot to the output directory. Training then runs
    unless ``--export-only`` was given or the training libraries could not be
    imported.

    Returns:
        ``0`` on success.

    Raises:
        SystemExit: With a message when no valid annotation record is found.
    """
    cli = argparse.ArgumentParser(description="Fine-tune recruiter NER model")
    option_specs = (
        ("--annotations", {"required": True, "help": "Path to JSON/JSONL annotation file"}),
        ("--output-dir", {"default": "models/recruiter_ner", "help": "Training output directory"}),
        ("--model-name", {"default": DEFAULT_MODEL, "help": "Base HF model to fine-tune"}),
        ("--epochs", {"type": int, "default": 3, "help": "Training epochs"}),
        ("--export-only", {"action": "store_true", "help": "Prepare dataset without training"}),
    )
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    options = cli.parse_args()

    source_file = Path(options.annotations).expanduser().resolve()
    target_dir = Path(options.output_dir).expanduser().resolve()

    raw_records = load_annotations(source_file)
    usable_records = validate_records(raw_records)
    if not usable_records:
        raise SystemExit("No valid annotation records found")

    # The snapshot is written in every mode, before any training starts.
    snapshot_path = export_prepared_dataset(usable_records, target_dir)
    print(f"Prepared dataset exported to {snapshot_path}")

    should_train = TRANSFORMERS_AVAILABLE and not options.export_only
    if not should_train:
        print("Training skipped: exporting only or transformers unavailable")
        return 0

    saved_to = train_model(usable_records, options.model_name, target_dir, options.epochs)
    print(f"Model trained and saved to {saved_to}")
    return 0
|
|
|
|
# Script entry point: run the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)
|
|