#!/usr/bin/env python3
"""Prepare annotated NER data for recruiter-domain fine-tuning.

The script supports two modes:

- `template`: tokenize texts and emit O labels for manual annotation.
- `bio`: convert provided span annotations into BIO token labels.

Input formats:

- JSONL, a JSON list, or a JSON object whose `data` key holds a list, with
  records containing `text` and optionally `spans`, where each span is
  `{start, end, label}`.
"""

from __future__ import annotations

import argparse
import json
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Sequence


@dataclass(frozen=True)
class BioSpan:
    """Labelled entity span over the half-open character range [start, end)."""

    start: int
    end: int
    label: str


# A token is either a run of word characters or one single character that is
# neither a word character nor whitespace (i.e. punctuation / symbols).
TOKEN_RE = re.compile(r"\w+|[^\w\s]", re.UNICODE)


def load_records(path: Path) -> List[Dict[str, Any]]:
    """Load annotation records from a JSON or JSONL file.

    Args:
        path: File to read. A `.jsonl` suffix (case-insensitive) selects
            line-delimited parsing; anything else is parsed as one JSON
            document that must be a list or an object with a list under
            the `data` key.

    Returns:
        The decoded records; an empty list when the file is blank.

    Raises:
        FileNotFoundError: If `path` does not exist.
        ValueError: If the JSON document has an unsupported shape
            (`json.JSONDecodeError`, a `ValueError` subclass, is raised for
            malformed JSON).
    """
    if not path.exists():
        raise FileNotFoundError(path)

    # "utf-8-sig" reads plain UTF-8 as well, but drops a leading BOM that
    # json.loads would otherwise reject.
    raw = path.read_text(encoding="utf-8-sig").strip()
    if not raw:
        return []

    if path.suffix.lower() == ".jsonl":
        # Split on "\n" only: str.splitlines() also breaks on U+2028, U+2029
        # and U+0085, which may legally appear unescaped inside JSON strings
        # (write_jsonl emits them raw because of ensure_ascii=False).
        return [json.loads(line) for line in raw.split("\n") if line.strip()]

    payload = json.loads(raw)
    if isinstance(payload, list):
        return payload
    if isinstance(payload, dict) and isinstance(payload.get("data"), list):
        return payload["data"]
    raise ValueError("Unsupported annotation file format")


def tokenize_with_offsets(text: str) -> List[Dict[str, Any]]:
    """Split `text` with `TOKEN_RE`, keeping each token's character offsets.

    Returns:
        One dict per token with keys `token`, `start` and `end`, in order of
        appearance; `end` is exclusive.
    """
    return [
        {"token": match.group(0), "start": match.start(), "end": match.end()}
        for match in TOKEN_RE.finditer(text)
    ]


def normalize_spans(spans: Optional[Iterable[Dict[str, Any]]]) -> List[BioSpan]:
    """Convert raw span dicts into validated, sorted `BioSpan` objects.

    Malformed entries are skipped rather than raising: entries that are not
    mappings, lack `start`/`end`/`label`, have non-integer offsets, have
    `end <= start`, or have a blank label.

    Args:
        spans: Raw span dicts, or `None` (treated as no spans).

    Returns:
        Valid spans sorted by `(start, end)`; labels are whitespace-stripped.
    """
    normalized: List[BioSpan] = []
    for span in spans or ():
        try:
            start = int(span["start"])
            end = int(span["end"])
            label = str(span["label"]).strip()
        except (KeyError, TypeError, ValueError, OverflowError):
            # Missing key, non-mapping entry, or offsets int() cannot convert.
            continue
        if end <= start or not label:
            # An inverted / zero-width range or an empty label cannot yield a
            # meaningful tag.
            continue
        normalized.append(BioSpan(start=start, end=end, label=label))
    return sorted(normalized, key=lambda item: (item.start, item.end))


def spans_to_bio(text: str, spans: Sequence[BioSpan]) -> Dict[str, Any]:
    """Project character spans onto tokens as BIO tags.

    Every token overlapping a span is tagged: the first with `B-<LABEL>`, the
    rest with `I-<LABEL>` (label upper-cased). A span that touches a token
    already tagged by an earlier span is skipped entirely, so the tag sequence
    never contains an `I-` tag cut off from its `B-` tag; spans are applied in
    the order given.

    Args:
        text: Text the span offsets refer to.
        spans: Spans to apply.

    Returns:
        Dict with `text`, `tokens` (token strings) and `ner_tags` (one tag
        per token).
    """
    tokens = tokenize_with_offsets(text)
    labels = ["O"] * len(tokens)

    for span in spans:
        covered: List[int] = []
        for index, token in enumerate(tokens):
            if token["start"] >= span.end:
                # Tokens are in text order, so nothing later can overlap.
                break
            if token["end"] > span.start:
                covered.append(index)

        if not covered or any(labels[index] != "O" for index in covered):
            continue

        tag = span.label.upper()
        labels[covered[0]] = f"B-{tag}"
        for index in covered[1:]:
            labels[index] = f"I-{tag}"

    return {
        "text": text,
        "tokens": [token["token"] for token in tokens],
        "ner_tags": labels,
    }


def build_template(text: str) -> Dict[str, Any]:
    """Tokenize `text` and tag every token `O`, ready for manual annotation.

    Returns:
        Dict with `text`, `tokens` and an all-`O` `ner_tags` list.
    """
    tokens = tokenize_with_offsets(text)
    return {
        "text": text,
        "tokens": [token["token"] for token in tokens],
        "ner_tags": ["O"] * len(tokens),
    }


def prepare_annotations(
    records: Sequence[Dict[str, Any]], mode: str = "template"
) -> List[Dict[str, Any]]:
    """Turn raw records into token-level annotation rows.

    Records that are not dicts, or whose `text` is missing, `None`, or blank,
    are skipped. The emitted text is whitespace-stripped.

    Args:
        records: Raw records, each expected to carry `text` and, for `bio`
            mode, optionally `spans`.
        mode: `"bio"` converts spans to BIO tags; any other value produces
            all-`O` templates.

    Returns:
        One dict per usable record with `text`, `tokens` and `ner_tags`.
    """
    prepared: List[Dict[str, Any]] = []
    for record in records:
        if not isinstance(record, dict):
            continue
        raw_text = record.get("text")
        if raw_text is None:
            # str(None) would otherwise become the literal text "None".
            continue
        raw_text = str(raw_text)
        text = raw_text.strip()
        if not text:
            continue

        if mode == "bio":
            # Span offsets refer to the unstripped text; shift them by the
            # amount of leading whitespace removed so they still line up.
            shift = len(raw_text) - len(raw_text.lstrip())
            spans = [
                BioSpan(start=span.start - shift, end=span.end - shift, label=span.label)
                for span in normalize_spans(record.get("spans"))
            ]
            prepared.append(spans_to_bio(text, spans))
        else:
            prepared.append(build_template(text))
    return prepared


def write_jsonl(records: Sequence[Dict[str, Any]], path: Path) -> Path:
    """Write `records` to `path` as UTF-8 JSONL, one JSON object per line.

    Missing parent directories are created. Non-ASCII characters are written
    as-is rather than escaped.

    Returns:
        The `path` that was written.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        for record in records:
            handle.write(json.dumps(record, ensure_ascii=False) + "\n")
    return path


def main() -> int:
    """Command-line entry point: load, prepare and write annotation records.

    Returns:
        Process exit code (always 0 when no exception is raised).
    """
    parser = argparse.ArgumentParser(description="Prepare annotated NER data")
    parser.add_argument("--input", required=True, help="Path to source JSON/JSONL file")
    parser.add_argument("--output", required=True, help="Output JSONL path")
    parser.add_argument("--mode", choices=["template", "bio"], default="template")
    args = parser.parse_args()

    input_path = Path(args.input).expanduser().resolve()
    output_path = Path(args.output).expanduser().resolve()

    records = load_records(input_path)
    prepared = prepare_annotations(records, mode=args.mode)
    write_jsonl(prepared, output_path)

    print(f"Prepared {len(prepared)} annotation records -> {output_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())