Spaces:

RHmaster
/

ai-talent-finder-backend

Running

File size: 4,276 Bytes

9df97a2

"""HF-based CV parser using token classification NER with safe fallbacks.

This parser is designed as a lightweight modern replacement layer that can be
used before legacy extractors. It focuses on high-signal entities and keeps the
output schema close to the existing extraction pipeline.
"""

from __future__ import annotations

import logging
import re
from typing import Dict, List, Tuple

try:
    from transformers import pipeline

    HF_NER_AVAILABLE = True
except Exception:
    HF_NER_AVAILABLE = False


class HFResumeNERParser:
    """NER parser powered by Hugging Face token classification models.

    The model is selected with the ``model_name`` constructor argument. This
    class does not read the ``HF_CV_NER_MODEL`` environment variable itself; a
    caller that wants env-based configuration has to pass that value in.
    Recommended values:
    - dslim/bert-base-NER
    - Davlan/bert-base-multilingual-cased-ner-hrl

    When ``transformers`` cannot be imported or the model fails to load, the
    parser keeps working in regex-only mode (emails and phones); use
    ``available`` to check whether NER is active.
    """

    EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
    PHONE_RE = re.compile(r"(?:\+?\d[\d\s().-]{7,}\d)")

    # Tunables; override on a subclass or on an instance.
    CHUNK_CHARS = 1600  # maximum number of characters sent to the model per call
    MAX_CHARS = 9600  # only this many leading characters of the text are analysed
    MIN_SCORE = 0.60  # entities scored below this threshold are discarded
    MAX_ITEMS = 10  # cap on the companies / job_titles / locations lists
    # Digit-count window for a plausible phone number; 15 is the E.164 maximum.
    MIN_PHONE_DIGITS = 7
    MAX_PHONE_DIGITS = 15

    _logger = logging.getLogger(__name__)

    def __init__(self, model_name: str = "dslim/bert-base-NER") -> None:
        """Try to build the NER pipeline; on any failure stay in regex-only mode.

        Args:
            model_name: Hugging Face model identifier passed to ``pipeline``.
        """
        self.model_name = model_name
        self.ner = None
        if not HF_NER_AVAILABLE:
            return
        try:
            self.ner = pipeline(
                "ner",
                model=self.model_name,
                aggregation_strategy="simple",
            )
        except Exception:
            # Deliberate best effort: a missing or broken model must not stop
            # the caller, but the reason should be visible in the logs.
            self._logger.warning(
                "Could not load NER model %r; falling back to regex-only extraction",
                self.model_name,
                exc_info=True,
            )
            self.ner = None

    @property
    def available(self) -> bool:
        """Whether a NER pipeline was loaded successfully."""
        return self.ner is not None

    def extract_structured_profile(self, text: str) -> Tuple[Dict, float]:
        """Extract minimal structured profile and quality score [0..100].

        Args:
            text: Raw CV text.

        Returns:
            ``(profile, quality)``. For empty ``text`` this is ``({}, 0.0)``.
            Otherwise ``profile`` always carries the same keys; ``education``
            and ``skills`` are always empty lists in this parser.
        """
        if not text:
            return {}, 0.0

        entities = self._extract_entities(text)

        emails = self._dedupe(self.EMAIL_RE.findall(text))
        phones = self._dedupe(
            match.strip()
            for match in self.PHONE_RE.findall(text)
            if self._plausible_phone(match)
        )

        person_names = entities.get("PER", [])
        first_name = person_names[0] if person_names else None

        profile = {
            "full_name": first_name,
            "name": first_name,
            "emails": emails,
            "email": emails[0] if emails else None,
            "phones": phones,
            "phone": phones[0] if phones else None,
            "companies": self._dedupe(entities.get("ORG", []), self.MAX_ITEMS),
            # NOTE(review): MISC is the catch-all group (unknown labels are also
            # folded into it), so these values are not guaranteed to be job
            # titles. Kept as is to preserve the existing output schema.
            "job_titles": self._dedupe(entities.get("MISC", []), self.MAX_ITEMS),
            "education": [],
            "skills": [],
            "locations": self._dedupe(entities.get("LOC", []), self.MAX_ITEMS),
            "extraction_metadata": {
                "model": self.model_name,
                "total_entities": sum(len(v) for v in entities.values()),
                "entity_groups": {k: len(v) for k, v in entities.items()},
            },
        }

        # Points per populated field; the weights add up to exactly 100.
        weights = (
            (profile["full_name"], 20),
            (profile["email"], 20),
            (profile["phone"], 10),
            (profile["companies"], 20),
            (profile["job_titles"], 15),
            (profile["locations"], 10),
            (profile["extraction_metadata"]["total_entities"] > 0, 5),
        )
        quality = 0.0
        for present, points in weights:
            if present:
                quality += points

        return profile, min(100.0, quality)

    def _extract_entities(self, text: str) -> Dict[str, List[str]]:
        """Run NER over ``text`` and group unique entity strings by label.

        Never raises: a chunk whose inference or post-processing fails is
        logged and skipped, and the remaining chunks are still processed.

        Returns:
            Dict with keys ``PER``, ``ORG``, ``LOC`` and ``MISC``; each value
            lists unique entity strings in first-seen order. Labels outside
            those four are stored under ``MISC``.
        """
        groups: Dict[str, List[str]] = {"PER": [], "ORG": [], "LOC": [], "MISC": []}
        if self.ner is None:
            return groups

        seen = set()
        for chunk in self._chunk_text(text):
            if not chunk.strip():
                continue
            try:
                accepted = []
                for entity in self.ner(chunk):
                    label = str(entity.get("entity_group", "MISC"))
                    word = str(entity.get("word", "")).strip()
                    score = float(entity.get("score", 0.0))
                    if not word or score < self.MIN_SCORE:
                        continue
                    if label not in groups:
                        label = "MISC"
                    accepted.append((label, word))
            except Exception:
                # Best effort: one bad chunk must not discard the other chunks.
                self._logger.warning(
                    "NER inference failed on a text chunk; skipping it",
                    exc_info=True,
                )
                continue
            for key in accepted:
                if key not in seen:
                    seen.add(key)
                    groups[key[0]].append(key[1])

        return groups

    def _chunk_text(self, text: str) -> List[str]:
        """Split the first ``MAX_CHARS`` characters into model-sized chunks.

        Each chunk is at most ``CHUNK_CHARS`` long. A chunk boundary is moved
        back to the last whitespace so words are not cut in half; if a window
        holds no whitespace at all it is cut hard at the size limit. The final
        chunk may still end mid-word where the ``MAX_CHARS`` cap falls.
        """
        size = max(1, int(self.CHUNK_CHARS))
        limit = min(len(text), int(self.MAX_CHARS))
        chunks: List[str] = []
        start = 0
        while start < limit:
            end = min(start + size, limit)
            if end < limit and not text[end].isspace():
                cut = end
                while cut > start and not text[cut - 1].isspace():
                    cut -= 1
                if cut > start:
                    end = cut
            chunks.append(text[start:end])
            start = end
        return chunks

    def _plausible_phone(self, candidate: str) -> bool:
        """Return True when ``candidate`` has a phone-like number of digits."""
        digits = sum(ch.isdigit() for ch in candidate)
        return self.MIN_PHONE_DIGITS <= digits <= self.MAX_PHONE_DIGITS

    @staticmethod
    def _dedupe(items, limit=None) -> List[str]:
        """Return unique ``items`` in first-seen order, capped at ``limit``."""
        return list(dict.fromkeys(items))[:limit]