Spaces:

RHmaster
/

ai-talent-finder-backend

Running

ai-talent-finder-backend / scripts /prepare_ner_annotations.py

ilyass yani

Deploiement backend dans HF Spaces

9df97a2 10 days ago

4.54 kB

	#!/usr/bin/env python3
	"""Prepare annotated NER data for recruiter-domain fine-tuning.

	The script supports two modes:
	- `template`: tokenize texts and emit O labels for manual annotation.
	- `bio`: convert provided span annotations into BIO token labels.

	Input formats:
	- JSONL, a JSON list, or a JSON object whose `data` key holds a list. Each
	  record contains `text` and optionally `spans`, where each span is
	  `{start, end, label}` with `start`/`end` given as character offsets.
	"""

	from __future__ import annotations

	import argparse
	import json
	import re
	from dataclasses import dataclass
	from pathlib import Path
	from typing import Any, Dict, Iterable, List, Sequence


	@dataclass(frozen=True)
	class BioSpan:
	    """Immutable labelled character range built from one span annotation.

	    Instances are hashable and compare by value, so they can be sorted and
	    de-duplicated freely.
	    """

	    start: int  # offset at which the span begins
	    end: int  # offset at which the span stops (treated as exclusive)
	    label: str  # entity type; upper-cased and prefixed with B-/I- when tagging


	# A token is either a run of word characters or a single punctuation/symbol
	# character; whitespace is never part of a token.  The alternation bar must
	# stay unescaped: an escaped ``\|`` demands a literal pipe character and
	# makes the pattern skip ordinary text.
	TOKEN_RE = re.compile(r"\w+|[^\w\s]", re.UNICODE)


	def load_records(path: Path) -> List[Dict[str, Any]]:
	    """Load annotation records from a JSONL or JSON file.

	    Files whose suffix is ``.jsonl`` (compared case-insensitively) are parsed
	    one JSON value per line, skipping blank lines.  Any other suffix is parsed
	    as a single JSON document that must be either a list of records or an
	    object whose ``data`` key holds such a list.

	    Args:
	        path: Location of the source file.

	    Returns:
	        The parsed records; an empty list when the file is empty or blank.

	    Raises:
	        FileNotFoundError: If ``path`` does not exist.
	        json.JSONDecodeError: If the content is not valid JSON.
	        ValueError: If the JSON document has an unsupported top-level shape.
	    """
	    if not path.exists():
	        raise FileNotFoundError(path)

	    # "utf-8-sig" reads plain UTF-8 as well, but drops a leading byte-order
	    # mark that json.loads would otherwise reject.
	    raw = path.read_text(encoding="utf-8-sig").strip()
	    if not raw:
	        return []

	    if path.suffix.lower() == ".jsonl":
	        # Split on "\n" only: str.splitlines() also breaks on U+2028, U+2029
	        # and U+0085, which may appear unescaped inside a JSON string and
	        # would cut a record in half.
	        return [json.loads(line) for line in raw.split("\n") if line.strip()]

	    payload = json.loads(raw)
	    if isinstance(payload, list):
	        return payload
	    if isinstance(payload, dict) and isinstance(payload.get("data"), list):
	        return payload["data"]
	    raise ValueError("Unsupported annotation file format")


	def tokenize_with_offsets(text: str) -> List[Dict[str, Any]]:
	    """Split ``text`` with ``TOKEN_RE`` and keep each match's character span.

	    Returns one dict per match, in order of appearance, with the keys
	    ``token`` (the matched text), ``start`` and ``end`` (the match offsets).
	    """
	    return [
	        {"token": hit.group(0), "start": hit.start(), "end": hit.end()}
	        for hit in TOKEN_RE.finditer(text)
	    ]


	def normalize_spans(spans: Iterable[Dict[str, Any]]) -> List[BioSpan]:
	    """Convert raw span dicts into ``BioSpan`` objects sorted by position.

	    Conversion is best-effort.  An entry is skipped when it cannot be indexed
	    by key, lacks ``start``/``end``/``label``, has offsets that cannot be
	    converted to ``int``, or covers no characters (``end <= start``).

	    Args:
	        spans: Raw span annotations; ``None`` is treated as "no spans".

	    Returns:
	        The valid spans ordered by ``(start, end)``.
	    """
	    if spans is None:
	        return []

	    normalized: List[BioSpan] = []
	    for span in spans:
	        # Only the field extraction is guarded, and only against the errors
	        # malformed annotation data can cause, so unrelated bugs still surface.
	        try:
	            start = int(span["start"])
	            end = int(span["end"])
	            label = str(span["label"])
	        except (KeyError, TypeError, ValueError, OverflowError):
	            continue
	        # An empty or inverted range selects no text; keeping it would let the
	        # overlap test tag a token the annotator never selected.
	        if end <= start:
	            continue
	        normalized.append(BioSpan(start=start, end=end, label=label))
	    return sorted(normalized, key=lambda item: (item.start, item.end))


	def spans_to_bio(text: str, spans: Sequence[BioSpan]) -> Dict[str, Any]:
	    """Tokenize ``text`` and tag every token with a BIO label.

	    A token belongs to a span when their character ranges overlap at all, so
	    a span that cuts through a token still labels the whole token.  Spans are
	    applied in the order given and never overwrite a token that an earlier
	    span already labelled, so overlapping or nested spans cannot leave an
	    ``I-`` tag without a matching ``B-`` tag in front of it.

	    Args:
	        text: The text the span offsets refer to.
	        spans: Labelled character ranges; earlier entries take precedence.

	    Returns:
	        A dict with ``text``, ``tokens`` (token strings) and ``ner_tags``
	        (one ``O``/``B-<LABEL>``/``I-<LABEL>`` tag per token, label upper-cased).
	    """
	    tokens = tokenize_with_offsets(text)
	    labels = ["O"] * len(tokens)

	    for span in spans:
	        tag = span.label.upper()
	        previous_index = None
	        for index, token in enumerate(tokens):
	            # Tokens come in text order, so nothing further can overlap.
	            if token["start"] >= span.end:
	                break
	            if token["end"] <= span.start:
	                continue
	            # Already claimed by an earlier span: leave it untouched.
	            if labels[index] != "O":
	                continue
	            # Restart with B- whenever the run of freshly tagged tokens was
	            # interrupted, which keeps the tag sequence valid BIO.
	            continues_run = previous_index is not None and previous_index == index - 1
	            prefix = "I-" if continues_run else "B-"
	            labels[index] = f"{prefix}{tag}"
	            previous_index = index

	    return {
	        "text": text,
	        "tokens": [token["token"] for token in tokens],
	        "ner_tags": labels,
	    }


	def build_template(text: str) -> Dict[str, Any]:
	    """Return an annotation skeleton for ``text`` with every token tagged ``O``.

	    The result has the keys ``text``, ``tokens`` and ``ner_tags``; the tag
	    list has one ``"O"`` entry per token, ready for manual annotation.
	    """
	    words = [entry["token"] for entry in tokenize_with_offsets(text)]
	    return {"text": text, "tokens": words, "ner_tags": ["O" for _ in words]}


	def prepare_annotations(records: Sequence[Dict[str, Any]], mode: str = "template") -> List[Dict[str, Any]]:
	    """Turn raw records into token-level annotation entries.

	    Records that are not dicts, or whose ``text`` is missing, ``None`` or
	    blank, are skipped.

	    Args:
	        records: Source records with a ``text`` field and, for ``bio`` mode,
	            an optional ``spans`` list.
	        mode: ``"bio"`` converts the spans into BIO tags; any other value
	            produces an all-``O`` template.

	    Returns:
	        One dict per usable record with ``text`` (whitespace-stripped),
	        ``tokens`` and ``ner_tags``.
	    """
	    prepared: List[Dict[str, Any]] = []
	    for record in records:
	        if not isinstance(record, dict):
	            continue

	        # str(None) would yield the literal text "None", so treat a null or
	        # missing field as empty instead of coercing it.
	        value = record.get("text")
	        raw_text = "" if value is None else str(value)
	        text = raw_text.strip()
	        if not text:
	            continue

	        if mode == "bio":
	            # "spans": null is equivalent to having no spans.
	            spans = normalize_spans(record.get("spans") or [])
	            # Span offsets index the text as stored in the record, so tagging
	            # runs on the unstripped text; stripping first would shift every
	            # offset by the amount of leading whitespace.  Whitespace never
	            # forms a token, so the token list is the same either way.
	            entry = spans_to_bio(raw_text, spans)
	            entry["text"] = text
	            prepared.append(entry)
	        else:
	            prepared.append(build_template(text))

	    return prepared


	def write_jsonl(records: Sequence[Dict[str, Any]], path: Path) -> Path:
	    """Serialize ``records`` to ``path`` as UTF-8 JSON Lines.

	    Missing parent directories are created, an existing file is overwritten,
	    and non-ASCII characters are written as-is rather than escaped.

	    Returns:
	        The ``path`` that was written.
	    """
	    target_dir = path.parent
	    target_dir.mkdir(parents=True, exist_ok=True)
	    with path.open("w", encoding="utf-8") as sink:
	        sink.writelines(
	            json.dumps(item, ensure_ascii=False) + "\n" for item in records
	        )
	    return path


	def main() -> int:
	    """Command-line entry point: parse arguments, convert the input, write JSONL.

	    Reads ``--input``, prepares the records according to ``--mode`` and writes
	    them to ``--output``, then prints a one-line summary.

	    Returns:
	        ``0`` once the output file has been written.
	    """
	    cli = argparse.ArgumentParser(description="Prepare annotated NER data")
	    required_paths = (
	        ("--input", "Path to source JSON/JSONL file"),
	        ("--output", "Output JSONL path"),
	    )
	    for flag, help_text in required_paths:
	        cli.add_argument(flag, required=True, help=help_text)
	    cli.add_argument("--mode", choices=["template", "bio"], default="template")
	    options = cli.parse_args()

	    # Resolve both locations up front so the summary shows absolute paths.
	    source = Path(options.input).expanduser().resolve()
	    destination = Path(options.output).expanduser().resolve()

	    annotations = prepare_annotations(load_records(source), mode=options.mode)
	    write_jsonl(annotations, destination)

	    print(f"Prepared {len(annotations)} annotation records -> {destination}")
	    return 0


	# Run the CLI only when executed as a script; the return value of main()
	# becomes the process exit status.
	if __name__ == "__main__":
	    exit_status = main()
	    raise SystemExit(exit_status)