File size: 4,544 Bytes
9df97a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/usr/bin/env python3
"""Prepare annotated NER data for recruiter-domain fine-tuning.

The script supports two modes:
- `template`: tokenize texts and emit O labels for manual annotation.
- `bio`: convert provided span annotations into BIO token labels.

Input formats:
- JSONL or JSON list with records containing `text` and optionally `spans`
  where each span is `{start, end, label}`.
"""

from __future__ import annotations

import argparse
import json
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Sequence


@dataclass(frozen=True)
class BioSpan:
    """Immutable entity annotation: a labelled character range of a text.

    The range is half-open as far as label assignment is concerned: a token
    that begins exactly at ``end`` is not considered part of the span.
    """

    # Offset of the first character covered by the span.
    start: int
    # Offset just past the last covered character (exclusive bound).
    end: int
    # Entity type name; it is upper-cased when BIO tags are generated.
    label: str


# Tokenizer pattern: a token is either a run of word characters (letters,
# digits, underscore) or one single character that is neither a word character
# nor whitespace. Whitespace itself never becomes a token.
TOKEN_RE = re.compile(r"\w+|[^\w\s]", re.UNICODE)


def load_records(path: Path) -> List[Dict[str, Any]]:
    """Load annotation records from a JSON or JSONL file.

    Files whose suffix is ``.jsonl`` (case-insensitive) are parsed as one JSON
    document per line, skipping blank lines. Any other file is parsed as a
    single JSON document that must be either a list of records or an object
    holding that list under the ``"data"`` key.

    Args:
        path: Location of the annotation file.

    Returns:
        The parsed records; an empty list when the file is empty or contains
        only whitespace.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If the content is not valid JSON. For JSONL
            input the message is prefixed with the file path and the
            1-based line number of the offending line.
        ValueError: If a JSON document is neither a list nor an object with
            a list under ``"data"``.
    """
    if not path.exists():
        raise FileNotFoundError(path)

    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading byte-order
    # mark, which json.loads would otherwise reject.
    content = path.read_text(encoding="utf-8-sig")
    if not content.strip():
        return []

    if path.suffix.lower() == ".jsonl":
        records: List[Dict[str, Any]] = []
        # Split on "\n" only: str.splitlines() also breaks on characters such
        # as U+2028 or U+0085, which may appear unescaped inside JSON strings
        # and would cut a record in half. Text-mode reading has already
        # translated "\r\n" and "\r" line endings to "\n".
        for line_number, line in enumerate(content.split("\n"), start=1):
            if not line.strip():
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Same exception type, but tell the user which line is broken.
                raise json.JSONDecodeError(
                    f"{path}:{line_number}: {err.msg}", err.doc, err.pos
                ) from err
        return records

    payload = json.loads(content)
    if isinstance(payload, list):
        return payload
    if isinstance(payload, dict) and isinstance(payload.get("data"), list):
        return payload["data"]
    raise ValueError("Unsupported annotation file format")


def tokenize_with_offsets(text: str) -> List[Dict[str, Any]]:
    """Split ``text`` into tokens and record where each one sits.

    Returns:
        One dict per ``TOKEN_RE`` match, in order of appearance, with the
        keys ``token`` (the matched text), ``start`` and ``end`` (the match
        offsets into ``text``, ``end`` being exclusive).
    """
    return [
        {"token": found.group(), "start": found.start(), "end": found.end()}
        for found in TOKEN_RE.finditer(text)
    ]


def normalize_spans(spans: Iterable[Dict[str, Any]]) -> List[BioSpan]:
    """Convert raw span mappings into sorted, validated ``BioSpan`` objects.

    Conversion is deliberately best-effort: an entry that cannot be turned
    into a usable span is skipped rather than aborting the whole record.
    An entry is skipped when it

    * is not a mapping or lacks ``start``, ``end`` or ``label``,
    * has offsets that cannot be converted with ``int()``,
    * has a negative ``start`` or an ``end`` that is not greater than
      ``start`` (such a range covers no text).

    Args:
        spans: Raw span entries, each expected to look like
            ``{"start": ..., "end": ..., "label": ...}``. ``None`` is
            accepted and treated as "no spans".

    Returns:
        The valid spans ordered by ``(start, end)``.
    """
    normalized: List[BioSpan] = []
    # ``spans or ()`` lets a JSON ``null`` spans field pass through as empty.
    for span in spans or ():
        try:
            candidate = BioSpan(
                start=int(span["start"]),
                end=int(span["end"]),
                label=str(span["label"]),
            )
        except (KeyError, TypeError, ValueError, OverflowError):
            # Only the failures that malformed annotation data can cause are
            # swallowed; anything else is a real bug and should surface.
            continue
        if candidate.start < 0 or candidate.end <= candidate.start:
            continue
        normalized.append(candidate)
    return sorted(normalized, key=lambda item: (item.start, item.end))


def spans_to_bio(text: str, spans: Sequence[BioSpan]) -> Dict[str, Any]:
    """Project character-level spans onto tokens as BIO tags.

    A token belongs to a span when their character ranges overlap at all, so
    a span that starts or ends inside a token still claims the whole token.
    The first token of a span is tagged ``B-<LABEL>`` and the remaining ones
    ``I-<LABEL>``, with the label upper-cased. Tokens claimed by no span keep
    the tag ``O``.

    Spans are applied in the order given. A span is skipped entirely when it
    overlaps no token or when any of its tokens was already tagged by an
    earlier span; overwriting would otherwise produce an invalid sequence
    such as ``B-A, B-B, I-A`` for nested or overlapping annotations.

    Args:
        text: The text the span offsets refer to.
        spans: Entity spans with character offsets into ``text``.

    Returns:
        A dict with ``text``, ``tokens`` (token strings) and ``ner_tags``
        (one BIO tag per token).
    """
    tokens = tokenize_with_offsets(text)
    labels = ["O"] * len(tokens)

    for span in spans:
        covered = [
            index
            for index, token in enumerate(tokens)
            if token["start"] < span.end and token["end"] > span.start
        ]
        if not covered or any(labels[index] != "O" for index in covered):
            continue

        tag = span.label.upper()
        labels[covered[0]] = f"B-{tag}"
        for index in covered[1:]:
            labels[index] = f"I-{tag}"

    return {
        "text": text,
        "tokens": [token["token"] for token in tokens],
        "ner_tags": labels,
    }


def build_template(text: str) -> Dict[str, Any]:
    """Produce an unlabelled record for manual annotation.

    Returns:
        A dict with the given ``text``, its ``tokens``, and ``ner_tags``
        holding one ``"O"`` per token.
    """
    words = [entry["token"] for entry in tokenize_with_offsets(text)]
    return {"text": text, "tokens": words, "ner_tags": ["O" for _ in words]}


def prepare_annotations(records: Sequence[Dict[str, Any]], mode: str = "template") -> List[Dict[str, Any]]:
    """Turn raw records into token/tag records ready to be written out.

    Records whose ``text`` is missing, ``null`` or blank are skipped.

    Args:
        records: Input records; each is read for ``text`` and, in ``bio``
            mode, for ``spans``.
        mode: ``"bio"`` converts the record's spans into BIO tags; any other
            value produces an all-``O`` template.

    Returns:
        One prepared record per usable input record, in input order.
    """
    prepared: List[Dict[str, Any]] = []
    for record in records:
        raw_text = record.get("text")
        # A JSON ``null`` must not be stringified into the literal "None".
        if raw_text is None:
            continue
        raw_text = str(raw_text)
        text = raw_text.strip()
        if not text:
            continue

        if mode == "bio":
            # ``or []`` also covers an explicit ``"spans": null``.
            spans = normalize_spans(record.get("spans") or [])
            # Span offsets index into the text exactly as it was annotated;
            # stripping leading whitespace first would shift every label.
            prepared.append(spans_to_bio(raw_text, spans))
        else:
            prepared.append(build_template(text))

    return prepared


def write_jsonl(records: Sequence[Dict[str, Any]], path: Path) -> Path:
    """Serialize ``records`` to ``path``, one JSON document per line.

    Missing parent directories are created, an existing file is overwritten,
    and non-ASCII characters are written as-is in UTF-8.

    Returns:
        The ``path`` that was written.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(entry, ensure_ascii=False) + "\n" for entry in records
        )
    return path


def main() -> int:
    """Command-line entry point: read records, prepare them, write JSONL.

    Reads ``--input``, ``--output`` and ``--mode`` from the command line,
    resolves both paths, converts the input records in the chosen mode and
    writes the result, then prints a one-line summary.

    Returns:
        ``0`` on success.
    """
    cli = argparse.ArgumentParser(description="Prepare annotated NER data")
    cli.add_argument("--input", required=True, help="Path to source JSON/JSONL file")
    cli.add_argument("--output", required=True, help="Output JSONL path")
    cli.add_argument("--mode", choices=["template", "bio"], default="template")
    options = cli.parse_args()

    source = Path(options.input).expanduser().resolve()
    destination = Path(options.output).expanduser().resolve()

    annotations = prepare_annotations(load_records(source), mode=options.mode)
    write_jsonl(annotations, destination)

    print(f"Prepared {len(annotations)} annotation records -> {destination}")
    return 0


# Run the CLI only when this file is executed as a script; the value returned
# by main() is handed to SystemExit and so becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())