| """HF-based CV parser using token classification NER with safe fallbacks. |
| |
| This parser is designed as a lightweight modern replacement layer that can be |
| used before legacy extractors. It focuses on high-signal entities and keeps the |
| output schema close to the existing extraction pipeline. |
| """ |
|
|
| from __future__ import annotations |
|
|
import logging
import re
from typing import Dict, List, Tuple
|
|
| try: |
| from transformers import pipeline |
|
|
| HF_NER_AVAILABLE = True |
| except Exception: |
| HF_NER_AVAILABLE = False |
|
|
|
|
class HFResumeNERParser:
    """NER parser powered by Hugging Face token classification models.

    The parser combines a ``transformers`` NER pipeline (when it can be
    loaded) with regex extraction of e-mail addresses and phone numbers, and
    returns a flat profile dict plus a heuristic quality score.

    Recommended values for ``model_name``:
    - dslim/bert-base-NER
    - Davlan/bert-base-multilingual-cased-ner-hrl

    NOTE(review): earlier documentation said the default model can be
    overridden with the env var ``HF_CV_NER_MODEL``. Nothing in this class
    reads that variable; presumably the caller resolves it and passes the
    result as ``model_name`` -- confirm against the call site.
    """

    EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
    PHONE_RE = re.compile(r"(?:\+?\d[\d\s().-]{7,}\d)")

    # PHONE_RE is deliberately loose, so its candidates are post-filtered.
    # "2019 - 2021" style employment/education ranges are not phone numbers.
    _YEAR_RANGE_RE = re.compile(r"(?:19|20)\d{2}\s*-\s*(?:19|20)\d{2}")
    # E.164 caps a full international number at 15 digits; fewer than 7
    # digits is shorter than the regex is meant to capture (e.g. "1 .... 2").
    _MIN_PHONE_DIGITS = 7
    _MAX_PHONE_DIGITS = 15

    # Only the first _MAX_CHARS characters are sent to the model, in pieces
    # of at most _CHUNK_CHARS characters each.
    _CHUNK_CHARS = 1600
    _MAX_CHARS = 9600
    # Entities scored below this confidence are discarded.
    _MIN_SCORE = 0.60
    # At most this many entities of a group are considered for the profile.
    _MAX_ITEMS = 10

    # (profile key, points awarded when that key holds a truthy value).
    _QUALITY_WEIGHTS = (
        ("full_name", 20),
        ("email", 20),
        ("phone", 10),
        ("companies", 20),
        ("job_titles", 15),
        ("locations", 10),
    )
    # Extra points when the model returned at least one usable entity.
    _ANY_ENTITY_BONUS = 5

    _log = logging.getLogger(__name__)

    def __init__(self, model_name: str = "dslim/bert-base-NER") -> None:
        """Try to load the NER pipeline for ``model_name``.

        Loading is best-effort: when ``transformers`` is not importable or
        the pipeline cannot be built, ``self.ner`` stays ``None`` and the
        parser falls back to regex-only extraction. The failure is logged
        instead of being discarded silently.
        """
        self.model_name = model_name
        self.ner = None
        if not HF_NER_AVAILABLE:
            return
        try:
            self.ner = pipeline(
                "ner",
                model=self.model_name,
                aggregation_strategy="simple",
            )
        except Exception:
            # Deliberate broad catch: any load failure must degrade to the
            # regex-only path rather than break construction.
            self._log.warning(
                "Could not load NER model %r; falling back to regex-only "
                "extraction",
                self.model_name,
                exc_info=True,
            )
            self.ner = None

    @property
    def available(self) -> bool:
        """Return ``True`` when the NER pipeline was loaded successfully."""
        return self.ner is not None

    def extract_structured_profile(self, text: str) -> Tuple[Dict, float]:
        """Extract a minimal structured profile and a quality score.

        Args:
            text: Raw CV text.

        Returns:
            ``(profile, quality)``. ``profile`` is ``{}`` for empty input;
            otherwise it holds the name, e-mail, phone, company, job-title
            and location fields plus ``extraction_metadata``. ``quality`` is
            a float in ``[0, 100]`` built from which fields were populated.
        """
        if not text:
            return {}, 0.0

        entities = self._extract_entities(text)
        emails = self.EMAIL_RE.findall(text)
        phones = self._extract_phones(text)

        person_names = entities.get("PER", [])
        organizations = entities.get("ORG", [])
        locations = entities.get("LOC", [])
        # MISC (which also absorbs unrecognized labels) is surfaced as
        # job_titles; this is a heuristic mapping, not a dedicated label.
        misc = entities.get("MISC", [])

        top = self._MAX_ITEMS
        primary_name = person_names[0] if person_names else None

        profile = {
            "full_name": primary_name,
            "name": primary_name,
            "emails": list(dict.fromkeys(emails)),
            "email": emails[0] if emails else None,
            "phones": list(dict.fromkeys(phones)),
            "phone": phones[0] if phones else None,
            "companies": list(dict.fromkeys(organizations[:top])),
            "job_titles": list(dict.fromkeys(misc[:top])),
            "education": [],
            "skills": [],
            "locations": list(dict.fromkeys(locations[:top])),
            "extraction_metadata": {
                "model": self.model_name,
                "total_entities": sum(len(v) for v in entities.values()),
                "entity_groups": {k: len(v) for k, v in entities.items()},
            },
        }

        quality = 0.0
        for key, points in self._QUALITY_WEIGHTS:
            if profile[key]:
                quality += points
        if profile["extraction_metadata"]["total_entities"] > 0:
            quality += self._ANY_ENTITY_BONUS

        return profile, min(100.0, quality)

    def _extract_phones(self, text: str) -> List[str]:
        """Return plausible phone numbers found in ``text``, in order.

        ``PHONE_RE`` is applied line by line so that numbers on consecutive
        lines are not merged into one match. Candidates are dropped when
        their digit count is outside ``[_MIN_PHONE_DIGITS,
        _MAX_PHONE_DIGITS]`` or when they are a year range such as
        ``2019 - 2021``.
        """
        phones: List[str] = []
        for line in text.splitlines():
            for candidate in self.PHONE_RE.findall(line):
                candidate = candidate.strip()
                digits = sum(ch.isdigit() for ch in candidate)
                if not self._MIN_PHONE_DIGITS <= digits <= self._MAX_PHONE_DIGITS:
                    continue
                if self._YEAR_RANGE_RE.fullmatch(candidate):
                    continue
                phones.append(candidate)
        return phones

    def _chunk_text(self, text: str) -> List[str]:
        """Split the first ``_MAX_CHARS`` characters into model-sized chunks.

        Each chunk holds at most ``_CHUNK_CHARS`` characters. A chunk that
        would end in the middle of a word is shortened to the last
        whitespace inside its window, so words are not cut in half; a hard
        cut is used only when the window contains no whitespace. The chunks
        concatenate back to ``text[:_MAX_CHARS]``.
        """
        window = text[: self._MAX_CHARS]
        total = len(window)
        chunks: List[str] = []
        start = 0
        while start < total:
            end = min(start + self._CHUNK_CHARS, total)
            if end < total and not window[end].isspace():
                # The cut would land inside a word: back off to just after
                # the previous whitespace character, if the window has one.
                split = end
                while split > start and not window[split - 1].isspace():
                    split -= 1
                if split > start:
                    end = split
            chunks.append(window[start:end])
            start = end
        return chunks

    def _extract_entities(self, text: str) -> Dict[str, List[str]]:
        """Run the NER pipeline over ``text`` and group entity strings.

        Returns:
            A dict with the keys ``PER``, ``ORG``, ``LOC`` and ``MISC``, each
            mapping to the unique entity strings in first-seen order.
            Entities scored below ``_MIN_SCORE`` are skipped and unrecognized
            labels are filed under ``MISC``. When no pipeline is loaded the
            lists are empty; when inference raises, the failure is logged
            and whatever was collected so far is returned.
        """
        groups: Dict[str, List[str]] = {"PER": [], "ORG": [], "LOC": [], "MISC": []}
        if self.ner is None:
            return groups

        seen = set()
        try:
            for chunk in self._chunk_text(text):
                for entity in self.ner(chunk):
                    label = str(entity.get("entity_group", "MISC"))
                    word = str(entity.get("word", "")).strip()
                    score = float(entity.get("score", 0.0))
                    if not word or score < self._MIN_SCORE:
                        continue
                    if label not in groups:
                        label = "MISC"
                    if (label, word) not in seen:
                        seen.add((label, word))
                        groups[label].append(word)
        except Exception:
            # Best-effort: keep the entities gathered before the failure.
            self._log.warning(
                "NER inference failed; returning partial entities",
                exc_info=True,
            )

        return groups
|
|