Spaces:

RHmaster
/

ai-talent-finder-backend

Running

ai-talent-finder-backend / ai_module /nlp /cv_parser.py

ilyass yani

Deploiement backend dans HF Spaces

9df97a2 10 days ago

4.28 kB

	"""HF-based CV parser using token classification NER with safe fallbacks.

	This parser is designed as a lightweight modern replacement layer that can be
	used before legacy extractors. It focuses on high-signal entities and keeps the
	output schema close to the existing extraction pipeline.
	"""

	from __future__ import annotations

	import logging
	import re
	from typing import Dict, List, Tuple

	# Optional dependency guard: the parser must stay importable on machines
	# where `transformers` is absent or broken. The catch is deliberately broad
	# because a failing import is not always an ImportError (a mismatched backend
	# can raise other exception types while the package initialises).
	try:
	    from transformers import pipeline

	    HF_NER_AVAILABLE = True
	except Exception:
	    # Keep the name bound so code that references `pipeline` outside an
	    # HF_NER_AVAILABLE check sees None instead of raising NameError.
	    pipeline = None
	    HF_NER_AVAILABLE = False


	class HFResumeNERParser:
	    """NER-based CV parser built on a Hugging Face token-classification pipeline.

	    Two sources of information are combined:

	    * regular expressions for e-mail addresses and phone numbers, and
	    * named entities (``PER``, ``ORG``, ``LOC``, ``MISC``) returned by the
	      pipeline, when one could be loaded.

	    When ``transformers`` is unavailable or the model fails to load, the
	    parser stays usable and only the regex-based fields are filled.

	    The model is selected with the ``model_name`` constructor argument.
	    Recommended values:

	    - dslim/bert-base-NER
	    - Davlan/bert-base-multilingual-cased-ner-hrl

	    NOTE(review): earlier documentation said the default model can be
	    overridden with the env var ``HF_CV_NER_MODEL``. This class never reads
	    that variable; presumably the caller resolves it and passes the result
	    as ``model_name`` - confirm.
	    """

	    EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
	    PHONE_RE = re.compile(r"(?:\+?\d[\d\s().-]{7,}\d)")

	    # Entity groups exposed to callers; any other label is folded into MISC.
	    ENTITY_LABELS = ("PER", "ORG", "LOC", "MISC")
	    # Runtime bound on long CVs: at most MAX_CHARS characters are analysed,
	    # in pieces of at most CHUNK_CHARS characters each.
	    CHUNK_CHARS = 1600
	    MAX_CHARS = 9600
	    # Entities scored below this confidence are discarded.
	    MIN_SCORE = 0.60
	    # Cap applied to the list-valued entity fields of the profile.
	    MAX_ITEMS = 10

	    # PHONE_RE is permissive, so candidates are filtered afterwards.
	    # 15 digits is the E.164 maximum; fewer than 7 digits is typically list
	    # numbering or a partial date rather than a phone number.
	    _MIN_PHONE_DIGITS = 7
	    _MAX_PHONE_DIGITS = 15
	    # "2019 - 2023" style date ranges satisfy PHONE_RE and are very common
	    # in CVs, so they are rejected explicitly.
	    _YEAR_RANGE_RE = re.compile(r"(?:19|20)\d{2}\s*-\s*(?:19|20)\d{2}")

	    # (profile key, points) pairs; each non-empty field adds its points.
	    _QUALITY_WEIGHTS = (
	        ("full_name", 20.0),
	        ("email", 20.0),
	        ("phone", 10.0),
	        ("companies", 20.0),
	        ("job_titles", 15.0),
	        ("locations", 10.0),
	    )

	    _logger = logging.getLogger(__name__)

	    def __init__(self, model_name: str = "dslim/bert-base-NER") -> None:
	        """Try to load the NER pipeline for ``model_name``.

	        Loading is best effort: on any failure ``self.ner`` stays ``None``
	        and the parser falls back to regex-only extraction.
	        """
	        self.model_name = model_name
	        self.ner = None
	        if not HF_NER_AVAILABLE:
	            return
	        try:
	            self.ner = pipeline(
	                "ner",
	                model=self.model_name,
	                aggregation_strategy="simple",
	            )
	        except Exception:
	            # Deliberately broad: a missing model, no network or a backend
	            # problem must not stop the service from starting.
	            self._logger.warning(
	                "Could not load NER model %r; using regex-only extraction",
	                self.model_name,
	                exc_info=True,
	            )
	            self.ner = None

	    @property
	    def available(self) -> bool:
	        """Whether an NER pipeline was loaded successfully."""
	        return self.ner is not None

	    def extract_structured_profile(self, text: str) -> Tuple[Dict, float]:
	        """Extract a minimal structured profile and a quality score [0..100].

	        Args:
	            text: Raw CV text.

	        Returns:
	            ``(profile, quality)``. ``profile`` is ``{}`` and ``quality`` is
	            ``0.0`` for empty input. Otherwise ``profile`` holds the name,
	            e-mail, phone, company, job-title and location fields plus
	            ``extraction_metadata``; ``education`` and ``skills`` are always
	            empty lists here. ``quality`` is the sum of the points of every
	            populated field, capped at 100.
	        """
	        if not text:
	            return {}, 0.0

	        entities = self._extract_entities(text)
	        emails = self.EMAIL_RE.findall(text)
	        phones = self._extract_phones(text)

	        person_names = entities.get("PER", [])
	        organizations = entities.get("ORG", [])
	        locations = entities.get("LOC", [])
	        # NOTE(review): MISC entities are reported as job titles; whether the
	        # configured model tags job titles as MISC is not visible here.
	        misc = entities.get("MISC", [])

	        cap = self.MAX_ITEMS
	        total_entities = sum(len(words) for words in entities.values())
	        first_name = person_names[0] if person_names else None

	        profile = {
	            "full_name": first_name,
	            "name": first_name,
	            "emails": list(dict.fromkeys(emails)),
	            "email": emails[0] if emails else None,
	            "phones": list(dict.fromkeys(phones)),
	            "phone": phones[0] if phones else None,
	            "companies": list(dict.fromkeys(organizations[:cap])),
	            "job_titles": list(dict.fromkeys(misc[:cap])),
	            "education": [],
	            "skills": [],
	            "locations": list(dict.fromkeys(locations[:cap])),
	            "extraction_metadata": {
	                "model": self.model_name,
	                "total_entities": total_entities,
	                "entity_groups": {
	                    label: len(words) for label, words in entities.items()
	                },
	            },
	        }

	        # Start from 0.0 so the score is a float even when nothing matched.
	        quality = sum(
	            (points for key, points in self._QUALITY_WEIGHTS if profile[key]),
	            0.0,
	        )
	        if total_entities > 0:
	            quality += 5.0

	        return profile, min(100.0, quality)

	    def _extract_phones(self, text: str) -> List[str]:
	        """Return plausible phone numbers in order of appearance.

	        The text is scanned line by line because ``\\s`` in ``PHONE_RE``
	        also matches newlines, which would let one match swallow numbers
	        from neighbouring lines.
	        """
	        phones: List[str] = []
	        for line in text.splitlines():
	            for match in self.PHONE_RE.findall(line):
	                candidate = match.strip()
	                digit_count = sum(ch.isdigit() for ch in candidate)
	                if digit_count < self._MIN_PHONE_DIGITS:
	                    continue
	                if digit_count > self._MAX_PHONE_DIGITS:
	                    continue
	                if self._YEAR_RANGE_RE.fullmatch(candidate):
	                    continue
	                phones.append(candidate)
	        return phones

	    def _split_chunks(self, text: str) -> List[str]:
	        """Split the first ``MAX_CHARS`` characters into NER-sized chunks.

	        A chunk never exceeds ``CHUNK_CHARS`` characters. When a chunk would
	        end in the middle of a word, the cut is moved back to the preceding
	        whitespace so an entity is not handed to the model in two halves.
	        A window without any whitespace is cut at the hard limit.
	        """
	        limit = min(len(text), self.MAX_CHARS)
	        chunks: List[str] = []
	        start = 0
	        while start < limit:
	            end = min(start + self.CHUNK_CHARS, limit)
	            # Only interior cuts are adjusted; the final cut at `limit` is
	            # kept as is so no extra fragment chunk is produced.
	            if end < limit and not text[end].isspace():
	                probe = end
	                while probe > start and not text[probe - 1].isspace():
	                    probe -= 1
	                if probe > start:
	                    end = probe
	            chunks.append(text[start:end])
	            start = end
	        return chunks

	    def _extract_entities(self, text: str) -> Dict[str, List[str]]:
	        """Run the NER pipeline and group the distinct entity words by label.

	        Returns:
	            A dict with exactly the keys in ``ENTITY_LABELS``. Words keep
	            their order of first appearance. All lists are empty when no
	            pipeline is loaded; if inference fails part-way, the entities
	            collected so far are returned.
	        """
	        # Dict keys act as an insertion-ordered set: O(1) de-duplication.
	        found: Dict[str, Dict[str, None]] = {
	            label: {} for label in self.ENTITY_LABELS
	        }

	        if self.ner is not None:
	            try:
	                for chunk in self._split_chunks(text):
	                    if not chunk.strip():
	                        continue
	                    for entity in self.ner(chunk):
	                        label = str(entity.get("entity_group", "MISC"))
	                        word = str(entity.get("word", "")).strip()
	                        score = float(entity.get("score", 0.0))
	                        if not word or score < self.MIN_SCORE:
	                            continue
	                        if label not in found:
	                            label = "MISC"
	                        found[label][word] = None
	            except Exception:
	                # Best effort: keep whatever was collected before the error.
	                self._logger.warning(
	                    "NER inference failed; returning partial entities",
	                    exc_info=True,
	                )

	        return {label: list(words) for label, words in found.items()}