Spaces:

RHmaster
/

ai-talent-finder-backend

Running

File size: 4,544 Bytes

9df97a2

#!/usr/bin/env python3
"""Prepare annotated NER data for recruiter-domain fine-tuning.

The script supports two modes:
- `template`: tokenize texts and emit O labels for manual annotation.
- `bio`: convert provided span annotations into BIO token labels.

Input formats:
- JSONL or JSON list with records containing `text` and optionally `spans`
  where each span is `{start, end, label}`.
"""

from __future__ import annotations

import argparse
import json
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Sequence


@dataclass(frozen=True)
class BioSpan:
    """Immutable labelled character span used to derive BIO token tags.

    Instances are created by `normalize_spans` from `{start, end, label}`
    records and consumed by `spans_to_bio`.
    """

    start: int  # character offset where the span begins
    end: int  # character offset where the span stops; `spans_to_bio` treats it as exclusive
    label: str  # entity label; upper-cased when the BIO tags are emitted


# Tokenizer pattern: a run of word characters (letters, digits, underscore) forms
# one token, and every other non-whitespace character is a token of its own.
# Whitespace never matches, so it only separates tokens.
TOKEN_RE = re.compile(r"\w+|[^\w\s]", re.UNICODE)


def load_records(path: Path) -> List[Dict[str, Any]]:
    """Read annotation records from a JSON or JSONL file.

    A `.jsonl` file (suffix compared case-insensitively) is parsed one JSON
    document per non-blank line. Any other file is parsed as a single JSON
    document that must be either a list of records or an object whose
    `data` entry is a list of records.

    Args:
        path: Location of the annotation file, read as UTF-8.

    Returns:
        The parsed records; an empty list when the file holds only whitespace.

    Raises:
        FileNotFoundError: If `path` does not exist.
        ValueError: If the JSON document has neither supported shape.
    """
    if not path.exists():
        raise FileNotFoundError(path)

    content = path.read_text(encoding="utf-8").strip()
    if content == "":
        return []

    is_jsonl = path.suffix.lower() == ".jsonl"
    if is_jsonl:
        records: List[Dict[str, Any]] = []
        for raw_line in content.splitlines():
            # Blank lines between records are tolerated.
            if raw_line.strip():
                records.append(json.loads(raw_line))
        return records

    document = json.loads(content)
    if isinstance(document, list):
        return document
    if isinstance(document, dict):
        wrapped = document.get("data")
        if isinstance(wrapped, list):
            return wrapped
    raise ValueError("Unsupported annotation file format")


def tokenize_with_offsets(text: str) -> List[Dict[str, Any]]:
    """Split `text` into tokens and record where each one sits.

    Returns:
        One dict per `TOKEN_RE` match, in order of appearance, with the
        matched string under `token` and its character range under
        `start` (inclusive) and `end` (exclusive).
    """
    return [
        {"token": hit.group(0), "start": hit.start(), "end": hit.end()}
        for hit in TOKEN_RE.finditer(text)
    ]


def normalize_spans(spans: Iterable[Dict[str, Any]]) -> List[BioSpan]:
    """Convert raw span records into `BioSpan` objects sorted by position.

    Parsing is best-effort: a malformed entry is dropped so that the
    remaining spans of the record are still usable.

    Args:
        spans: Iterable of mappings carrying `start`, `end` and `label`.
            `None` is accepted and treated as "no spans", so a record with
            `"spans": null` does not abort the run.

    Returns:
        The valid spans ordered by `(start, end)`. An entry is skipped when
        it cannot be indexed by key, lacks one of the three keys, has an
        offset that `int()` cannot convert, or has a null/blank label.
    """
    if spans is None:
        return []

    normalized: List[BioSpan] = []
    for span in spans:
        try:
            start = int(span["start"])
            end = int(span["end"])
            raw_label = span["label"]
        except (LookupError, TypeError, ValueError, OverflowError):
            # Only the failures that indexing and int() can produce are
            # swallowed (missing key, non-mapping entry, non-numeric or
            # infinite offset); anything else is a real bug and propagates.
            continue

        # str(None) would surface as a bogus "NONE" entity and a blank label
        # as a bare "B-" tag, so such entries are treated as malformed.
        if raw_label is None or not str(raw_label).strip():
            continue

        normalized.append(BioSpan(start=start, end=end, label=str(raw_label)))
    return sorted(normalized, key=lambda item: (item.start, item.end))


def spans_to_bio(text: str, spans: Sequence[BioSpan]) -> Dict[str, Any]:
    """Project character-level spans onto the tokens of `text` as BIO tags.

    A token belongs to a span when their character ranges intersect (the
    span end is exclusive). The first token of a span is tagged
    `B-<LABEL>` and the following ones `I-<LABEL>`, with the label
    upper-cased. Tokens outside every span keep the tag `O`.

    Spans are applied in the order given. A span is ignored when it is
    empty or inverted (`end <= start`), when it touches no token, or when
    it touches a token already claimed by an earlier span.

    Args:
        text: The text the span offsets refer to.
        spans: Labelled character ranges to project onto the tokens.

    Returns:
        A dict with the original `text`, the list of `tokens` and the
        parallel list of `ner_tags`.
    """
    tokens = tokenize_with_offsets(text)
    labels = ["O"] * len(tokens)

    for span in spans:
        if span.end <= span.start:
            # A span that covers no characters cannot label anything.
            continue

        covered = [
            index
            for index, token in enumerate(tokens)
            if token["start"] < span.end and token["end"] > span.start
        ]
        if not covered:
            continue
        if any(labels[index] != "O" for index in covered):
            # Overwriting part of an earlier span would leave an invalid
            # sequence such as B-A, B-B, I-A; the earlier span wins.
            continue

        tag = span.label.upper()
        labels[covered[0]] = f"B-{tag}"
        for index in covered[1:]:
            labels[index] = f"I-{tag}"

    return {
        "text": text,
        "tokens": [token["token"] for token in tokens],
        "ner_tags": labels,
    }


def build_template(text: str) -> Dict[str, Any]:
    """Build an unlabelled record ready for manual annotation.

    Returns:
        A dict with the given `text`, its `tokens`, and a `ner_tags` list
        of the same length in which every tag is `O`.
    """
    surface_forms = [entry["token"] for entry in tokenize_with_offsets(text)]
    placeholder_tags = ["O" for _ in surface_forms]
    return {"text": text, "tokens": surface_forms, "ner_tags": placeholder_tags}


def prepare_annotations(records: Sequence[Dict[str, Any]], mode: str = "template") -> List[Dict[str, Any]]:
    """Turn raw records into token-level annotation records.

    Args:
        records: Mappings with a `text` entry and, for `bio` mode,
            an optional `spans` entry of `{start, end, label}` items.
        mode: `"bio"` converts the spans into BIO tags; any other value
            emits an all-`O` template.

    Returns:
        One prepared record per input record that has non-blank text.
        Records whose text is missing, null or only whitespace are skipped.
        The emitted `text` is stripped of surrounding whitespace.
    """
    prepared: List[Dict[str, Any]] = []
    for record in records:
        raw_value = record.get("text")
        if raw_value is None:
            # str(None) would turn a null text into the literal text "None".
            continue
        raw_text = str(raw_value)
        text = raw_text.strip()
        if not text:
            continue

        if mode == "bio":
            # `or []` also covers an explicit `"spans": null`.
            spans = normalize_spans(record.get("spans") or [])
            # Span offsets index the text as stored in the record, so the
            # unstripped text is tokenized; stripping first would shift every
            # offset by the amount of leading whitespace. Surrounding
            # whitespace yields no tokens, so the token list is unaffected.
            example = spans_to_bio(raw_text, spans)
            example["text"] = text
            prepared.append(example)
        else:
            prepared.append(build_template(text))

    return prepared


def write_jsonl(records: Sequence[Dict[str, Any]], path: Path) -> Path:
    """Serialize `records` to `path`, one JSON document per line.

    Missing parent directories are created. The file is written as UTF-8
    and non-ASCII characters are kept verbatim rather than escaped.

    Returns:
        The `path` that was written, unchanged.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(entry, ensure_ascii=False) + "\n" for entry in records
        )
    return path


def main() -> int:
    """Command-line entry point: load, prepare and write annotation records.

    Parses `--input`, `--output` (both required) and `--mode`
    (`template` by default), runs the pipeline, and prints a one-line
    summary.

    Returns:
        `0` once the output file has been written.
    """
    cli = argparse.ArgumentParser(description="Prepare annotated NER data")
    for flag, help_text in (
        ("--input", "Path to source JSON/JSONL file"),
        ("--output", "Output JSONL path"),
    ):
        cli.add_argument(flag, required=True, help=help_text)
    cli.add_argument("--mode", choices=["template", "bio"], default="template")
    options = cli.parse_args()

    # Both paths are made absolute up front so the summary shows the real location.
    source = Path(options.input).expanduser().resolve()
    destination = Path(options.output).expanduser().resolve()

    annotations = prepare_annotations(load_records(source), mode=options.mode)
    write_jsonl(annotations, destination)

    print(f"Prepared {len(annotations)} annotation records -> {destination}")
    return 0


# Run the CLI only when executed as a script; the value returned by main()
# becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())