#!/usr/bin/env python3
"""Prepare annotated NER data for recruiter-domain fine-tuning.
The script supports two modes:
- `template`: tokenize texts and emit O labels for manual annotation.
- `bio`: convert provided span annotations into BIO token labels.
Input formats:
- JSONL or JSON list with records containing `text` and optionally `spans`
where each span is `{start, end, label}`.
"""
from __future__ import annotations
import argparse
import json
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Sequence
@dataclass(frozen=True)
class BioSpan:
    """Immutable character-level entity annotation.

    Instances are hashable and comparable by value; assigning to a field
    after construction raises ``dataclasses.FrozenInstanceError``.
    """

    # Character offset where the annotated region begins.
    start: int
    # Character offset where the annotated region stops.
    end: int
    # Entity type name as supplied by the annotation source.
    label: str
# A token is either a run of word characters or one single character that is
# neither a word character nor whitespace, so whitespace never becomes a token.
TOKEN_RE = re.compile(r"\w+|[^\w\s]", flags=re.UNICODE)
def load_records(path: Path) -> List[Dict[str, Any]]:
    """Load annotation records from a JSON or JSONL file.

    A file whose suffix is ``.jsonl`` (case-insensitive) is parsed as one
    JSON document per non-blank line. Any other file is parsed as a single
    JSON document that must be either a list, or an object whose ``data``
    key holds a list.

    Args:
        path: Location of the file to read.

    Returns:
        The parsed records; an empty list when the file contains only
        whitespace.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the JSON document has an unsupported top-level shape
            (``json.JSONDecodeError`` for invalid JSON is a ``ValueError``
            subclass).
    """
    if not path.exists():
        raise FileNotFoundError(path)

    # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading BOM,
    # which json.loads would otherwise reject.
    raw = path.read_text(encoding="utf-8-sig").strip()
    if not raw:
        return []

    if path.suffix.lower() == ".jsonl":
        # Split on "\n" only. str.splitlines() also breaks on U+2028, U+2029
        # and U+0085, which may appear unescaped inside a JSON string and
        # would cut a valid record in half. Text-mode reading has already
        # normalised "\r\n" and "\r" to "\n".
        return [json.loads(line) for line in raw.split("\n") if line.strip()]

    payload = json.loads(raw)
    if isinstance(payload, list):
        return payload
    if isinstance(payload, dict) and isinstance(payload.get("data"), list):
        return payload["data"]
    raise ValueError("Unsupported annotation file format")
def tokenize_with_offsets(text: str) -> List[Dict[str, Any]]:
    """Split ``text`` into tokens and record where each one sits.

    Args:
        text: The string to tokenize.

    Returns:
        One dict per ``TOKEN_RE`` match, in order of appearance, with keys
        ``token`` (the matched text), ``start`` and ``end`` (the match's
        character offsets; ``end`` is exclusive).
    """
    return [
        {"token": found.group(0), "start": found.start(), "end": found.end()}
        for found in TOKEN_RE.finditer(text)
    ]
def normalize_spans(spans: Iterable[Dict[str, Any]]) -> List[BioSpan]:
    """Convert raw span mappings into sorted, validated ``BioSpan`` objects.

    This is a best-effort conversion: entries that cannot be interpreted are
    skipped rather than aborting the whole record.

    Args:
        spans: Raw span entries, each expected to provide ``start``, ``end``
            and ``label``. ``None`` is accepted and treated as "no spans".

    Returns:
        The usable spans ordered by ``(start, end)``. Entries that are
        missing a key, are not subscriptable by key, hold offsets that
        cannot be converted to ``int``, or cover no characters
        (``end <= start``) are omitted.
    """
    if spans is None:
        # A JSON ``"spans": null`` arrives here as None.
        return []

    normalized: List[BioSpan] = []
    for span in spans:
        try:
            candidate = BioSpan(
                start=int(span["start"]),
                end=int(span["end"]),
                label=str(span["label"]),
            )
        except (LookupError, TypeError, ValueError, OverflowError):
            # Only the failures malformed span data can cause are skipped:
            # missing key, non-mapping entry, non-numeric or infinite offset.
            # Anything else is a real bug and should surface.
            continue
        if candidate.end <= candidate.start:
            # A zero-width or inverted span covers no text, but the overlap
            # test used for labelling could still match a token that strictly
            # contains the offset, so drop it here.
            continue
        normalized.append(candidate)
    return sorted(normalized, key=lambda item: (item.start, item.end))
def spans_to_bio(text: str, spans: Sequence[BioSpan]) -> Dict[str, Any]:
    """Project character-level spans onto tokens as BIO tags.

    Every token that shares at least one character with a span is tagged with
    the span's upper-cased label: ``B-`` for the first such token of that
    span, ``I-`` for the rest. Spans are applied in the order given, so where
    spans overlap the later one overwrites the earlier tag.

    Args:
        text: The text the span offsets refer to.
        spans: Spans to project onto the tokens of ``text``.

    Returns:
        A dict with ``text``, ``tokens`` (token strings) and ``ner_tags``
        (one tag per token, ``"O"`` where no span applies).
    """
    pieces = tokenize_with_offsets(text)
    tags = ["O"] * len(pieces)

    for span in spans:
        opened = False
        for position, piece in enumerate(pieces):
            # Skip tokens that lie entirely before or entirely after the span.
            if piece["start"] >= span.end or piece["end"] <= span.start:
                continue
            marker = "I-" if opened else "B-"
            tags[position] = marker + span.label.upper()
            opened = True

    words = [piece["token"] for piece in pieces]
    return {"text": text, "tokens": words, "ner_tags": tags}
def build_template(text: str) -> Dict[str, Any]:
    """Build an all-``O`` annotation record for manual labelling.

    Args:
        text: The text to tokenize.

    Returns:
        A dict with ``text``, ``tokens`` (token strings) and ``ner_tags``
        (one ``"O"`` per token).
    """
    words = [piece["token"] for piece in tokenize_with_offsets(text)]
    blank_tags = ["O"] * len(words)
    return {"text": text, "tokens": words, "ner_tags": blank_tags}
def prepare_annotations(records: Sequence[Dict[str, Any]], mode: str = "template") -> List[Dict[str, Any]]:
    """Turn raw records into token-level annotation records.

    Args:
        records: Input records; each is read for ``text`` and, in ``bio``
            mode, ``spans``.
        mode: ``"bio"`` converts the record's spans into BIO tags; any other
            value emits an all-``O`` template.

    Returns:
        One output record per input record that has non-blank text. Records
        whose ``text`` is missing, ``None`` or whitespace-only are skipped.
        The emitted ``text`` is stripped of surrounding whitespace.
    """
    prepared: List[Dict[str, Any]] = []
    for record in records:
        raw_text = record.get("text")
        if raw_text is None:
            # str(None) would yield the literal text "None"; a null text
            # means there is nothing to annotate.
            continue
        source_text = str(raw_text)
        text = source_text.strip()
        if not text:
            continue

        if mode == "bio":
            # ``or []`` also covers an explicit ``"spans": null``.
            spans = normalize_spans(record.get("spans") or [])
            # Span offsets index into the text as stored in the record, so
            # align against the unstripped string; stripping first would
            # shift every offset by the amount of leading whitespace. The
            # token list is unaffected because whitespace is never a token.
            entry = spans_to_bio(source_text, spans)
            entry["text"] = text
            prepared.append(entry)
        else:
            prepared.append(build_template(text))
    return prepared
def write_jsonl(records: Sequence[Dict[str, Any]], path: Path) -> Path:
    """Write ``records`` to ``path`` as UTF-8 JSON Lines.

    Missing parent directories are created, and an existing file is
    overwritten. Non-ASCII characters are written as-is rather than escaped.

    Args:
        records: Records to serialise, one JSON document per line.
        path: Destination file.

    Returns:
        ``path``, unchanged.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in records
        )
    return path
def main() -> int:
    """Run the command-line interface.

    Parses ``--input``, ``--output`` and ``--mode``, loads the input records,
    prepares them in the chosen mode, writes them as JSONL and prints a
    one-line summary.

    Returns:
        ``0`` on completion.
    """
    cli = argparse.ArgumentParser(description="Prepare annotated NER data")
    cli.add_argument("--input", required=True, help="Path to source JSON/JSONL file")
    cli.add_argument("--output", required=True, help="Output JSONL path")
    cli.add_argument("--mode", choices=["template", "bio"], default="template")
    options = cli.parse_args()

    # Expand "~" and make both paths absolute before any file is touched.
    source = Path(options.input).expanduser().resolve()
    destination = Path(options.output).expanduser().resolve()

    prepared = prepare_annotations(load_records(source), mode=options.mode)
    write_jsonl(prepared, destination)

    print(f"Prepared {len(prepared)} annotation records -> {destination}")
    return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())