# Spaces: RHmaster / ai-talent-finder-backend (Running)
# File size: 9,789 Bytes
# 9df97a2

"""
Enhanced skill extractor: combines NER with dictionary fuzzy matching.
Step 6 optimization: a hybrid approach for maximum skill coverage.
"""

from typing import List, Dict, Optional
import json
import os
import re

try:
    from transformers import pipeline
    NER_AVAILABLE = True
except ImportError:
    NER_AVAILABLE = False

from ai_module.nlp.skill_extractor import SkillExtractor
from ai_module.matching.semantic_matcher import SemanticSkillMatcher

try:
    from ai_module.nlp.multilingual_skill_extractor import MultilingualSkillExtractor
    MULTILINGUAL_EXTRACTOR_AVAILABLE = True
except Exception:
    MultilingualSkillExtractor = None
    MULTILINGUAL_EXTRACTOR_AVAILABLE = False


class EnhancedSkillExtractor:
    """
    Extract skills from free text with a hybrid pipeline.

    The pipeline runs, in order:
    1. A ``transformers`` NER pipeline (optional; skipped when the package or
       the model cannot be loaded).
    2. The dictionary-based ``SkillExtractor`` (hits are reported with a fixed
       confidence of ``DICT_CONFIDENCE``).
    3. The multilingual skill extractor, when it could be imported.
    4. Embedding-based normalization of every hit onto the canonical skill list.

    Hits are de-duplicated case-insensitively and returned sorted by
    confidence, highest first.
    """

    # Only this many leading characters are sent to the NER model, to keep
    # runtime bounded.
    NER_MAX_CHARS = 2000
    # An entity must score strictly above this value to be kept.
    NER_MIN_SCORE = 0.70
    # Entity groups that are treated as skill candidates.
    NER_ENTITY_GROUPS = frozenset({"MISC", "ORG", "SKILL"})
    # Confidence assigned to every dictionary hit (deliberately below NER).
    DICT_CONFIDENCE = 0.80

    _TECH_KEYWORDS = (
        "python", "java", "javascript", "react", "angular", "vue", "node",
        "aws", "azure", "gcp", "docker", "kubernetes", "sql", "mongodb",
        "api", "rest", "graphql", "fastapi", "django", "flask",
    )
    _SOFT_KEYWORDS = (
        "leadership", "communication", "teamwork", "management", "planning",
        "problem solving", "analytical", "creative", "adaptability",
    )
    _LANGUAGE_KEYWORDS = (
        "english", "french", "spanish", "german", "italian", "arabic",
        "portuguese", "mandarin", "japanese",
    )
    # Tech keywords are matched as whole words: a plain substring test makes
    # "api" match "capital" and "rest" match "restaurant", which would label
    # e.g. "Capital Management" as tech before the soft-skill check runs.
    _TECH_KEYWORD_RE = re.compile(
        r"\b(?:" + "|".join(map(re.escape, _TECH_KEYWORDS)) + r")\b"
    )

    def __init__(self, load_ner: bool = True):
        """
        Build the dictionary extractor and, optionally, the NER pipeline.

        Args:
            load_ner: When False, the NER model is never loaded and only the
                dictionary/multilingual extractors are used.

        Environment variables read:
            SKILL_NORMALIZATION_THRESHOLD: minimum similarity for a skill to be
                renamed to its canonical form (default "0.62").
            HF_SKILL_NER_MODEL: model name passed to the NER pipeline
                (default "dslim/bert-base-NER").
        """
        self.skill_extractor = SkillExtractor()
        self.canonical_skills = list(self.skill_extractor.all_skills)
        self.semantic_threshold = float(os.getenv("SKILL_NORMALIZATION_THRESHOLD", "0.62"))
        self.ner_model_name = os.getenv("HF_SKILL_NER_MODEL", "dslim/bert-base-NER")
        self.multilingual_extractor = MultilingualSkillExtractor() if MULTILINGUAL_EXTRACTOR_AVAILABLE else None

        # Always define both attributes so the instance is in a consistent
        # state whether or not the model loads.
        self.ner_pipeline = None
        self.ner_available = False

        if load_ner and NER_AVAILABLE:
            try:
                self.ner_pipeline = pipeline(
                    "ner",
                    model=self.ner_model_name,
                    aggregation_strategy="simple"
                )
                self.ner_available = True
            except Exception as e:
                # Best effort: fall back to dictionary-only extraction.
                print(f"⚠️ NER model not available: {e}. Using dictionary-only extraction.")

    @staticmethod
    def _append_if_new(skills: List[Dict], seen: set, entry: Dict) -> None:
        """Append *entry* to *skills* unless it is nameless or its name was already seen."""
        key = str(entry.get("name", "")).strip().lower()
        if not key or key in seen:
            return
        seen.add(key)
        skills.append(entry)

    def extract_skills_hybrid(self, text: str, threshold: int = 80) -> List[Dict]:
        """
        Extract skills using the NER, dictionary and multilingual extractors.

        Args:
            text: CV text. Empty, whitespace-only or ``None`` input yields ``[]``.
            threshold: Fuzzy matching threshold forwarded to the dictionary
                extractor.

        Returns:
            List of skill dicts sorted by confidence (descending), e.g.
            [
                {"name": "Python", "source": "NER", "confidence": 0.95},
                {"name": "React", "source": "DICT-FUZZY", "confidence": 0.80}
            ]
        """
        if not text or not text.strip():
            return []

        all_skills: List[Dict] = []
        seen_skills = set()  # Lower-cased names already collected

        # Step 1: NER candidates come first so they win over dictionary
        # duplicates of the same name.
        if self.ner_available:
            for ner_skill in self._extract_via_ner(text):
                self._append_if_new(all_skills, seen_skills, ner_skill)

        # Step 2: Dictionary matches (for coverage).
        for dict_skill in self.skill_extractor.extract_skills(text, threshold=threshold):
            self._append_if_new(all_skills, seen_skills, {
                "name": dict_skill.get("name", ""),
                "source": "DICT-FUZZY",
                "confidence": self.DICT_CONFIDENCE,
                "category": dict_skill.get("category", "tech"),
                "method": dict_skill.get("method", "fuzzy")
            })

        # Step 2b: Multilingual aliases (FR/EN/ES) for better recall.
        if self.multilingual_extractor is not None:
            for multilingual_skill in self.multilingual_extractor.extract_skills(text):
                # Copy so the extractor's own objects are never modified later.
                self._append_if_new(all_skills, seen_skills, dict(multilingual_skill))

        # Step 3: Embedding-based normalization to the canonical skill list.
        normalized = self._normalize_with_embeddings(all_skills)
        if normalized:
            all_skills = normalized

        all_skills.sort(key=lambda x: x.get("confidence", 0), reverse=True)

        return all_skills

    def _normalize_with_embeddings(self, skills: List[Dict]) -> List[Dict]:
        """
        Map extracted skill variants to their nearest canonical skill.

        Each skill is looked up with ``SemanticSkillMatcher.search_similar``;
        when the best match reaches ``self.semantic_threshold`` the skill is
        renamed and annotated with ``normalized_name`` and
        ``normalization_similarity``. Skills that collapse onto the same name
        are de-duplicated, keeping the first occurrence.

        The input dicts are not modified; copies are returned. If the matcher
        raises, normalization is skipped for the rest of the call and the
        remaining skills keep their extracted names.
        """
        if not skills:
            return []

        normalized: List[Dict] = []
        seen = set()
        matcher_usable = True

        for skill in skills:
            entry = dict(skill)
            raw_name = str(entry.get("name", "")).strip()
            if not raw_name:
                continue

            nearest = None
            if matcher_usable:
                try:
                    nearest = SemanticSkillMatcher.search_similar(
                        raw_name, self.canonical_skills, top_k=1
                    )
                except Exception as e:
                    # Same best-effort policy as the NER path: degrade, don't crash.
                    print(f"⚠️ Skill normalization error: {e}. Keeping extracted names.")
                    matcher_usable = False

            if nearest:
                best_name, similarity = nearest[0]
                if similarity >= self.semantic_threshold:
                    entry["normalized_name"] = best_name
                    entry["normalization_similarity"] = round(similarity, 4)
                    entry["name"] = best_name

            key = str(entry.get("name", "")).lower()
            if key in seen:
                continue
            seen.add(key)
            normalized.append(entry)

        return normalized

    def _extract_via_ner(self, text: str) -> List[Dict]:
        """
        Extract candidate skill entities from the NER pipeline output.

        Only entities whose group is in ``NER_ENTITY_GROUPS`` and whose score
        is strictly above ``NER_MIN_SCORE`` are kept. Any error raised while
        running the pipeline is reported and results in an empty list.
        """
        if not self.ner_available or self.ner_pipeline is None or not text:
            return []

        try:
            # Keep runtime bounded.
            ner_results = self.ner_pipeline(text[:self.NER_MAX_CHARS])

            ner_skills = []
            for entity in ner_results:
                group = str(entity.get("entity_group", "")).upper()
                if group not in self.NER_ENTITY_GROUPS:
                    continue
                score = float(entity.get("score", 0))
                if score <= self.NER_MIN_SCORE:
                    continue

                # "##" marks a word-piece continuation: glue it to the previous
                # piece ("Java ##Script" -> "JavaScript") instead of leaving a gap.
                skill_name = re.sub(r"\s*##", "", str(entity.get("word", "")))
                skill_name = re.sub(r"\s+", " ", skill_name).strip()
                if len(skill_name) < 2:
                    continue

                ner_skills.append({
                    "name": skill_name.title(),
                    "source": "NER",
                    "confidence": score,
                    "category": self._classify_skill(skill_name),
                    "method": f"NER-{self.ner_model_name}"
                })

            return ner_skills
        except Exception as e:
            print(f"⚠️ NER extraction error: {e}")
            return []

    def _classify_skill(self, skill_name: str) -> str:
        """
        Classify a skill name into a category.

        The dictionary extractor's ``skill_categories`` mapping is consulted
        first (keyed by the lower-cased name). Otherwise keyword heuristics
        apply, in order: tech (whole-word match), soft, language. Anything
        unmatched defaults to "tech".
        """
        skill_lower = skill_name.lower()

        category = self.skill_extractor.skill_categories.get(skill_lower)
        if category:
            return category

        if self._TECH_KEYWORD_RE.search(skill_lower):
            return "tech"
        if any(keyword in skill_lower for keyword in self._SOFT_KEYWORDS):
            return "soft"
        if any(keyword in skill_lower for keyword in self._LANGUAGE_KEYWORDS):
            return "language"
        return "tech"  # Default

    def get_extraction_stats(self, skills: List[Dict]) -> Dict:
        """
        Summarize an extraction result.

        Args:
            skills: Skill dicts as returned by ``extract_skills_hybrid``.

        Returns:
            Dict with ``total``, ``by_source`` (count per ``source`` value,
            "unknown" when absent), ``avg_confidence`` (rounded to 3 decimals)
            and ``top_3`` (names of the first three skills, in input order).
            The empty case additionally keeps the legacy ``coverage`` key.
        """
        if not skills:
            return {
                "total": 0,
                "by_source": {},
                "avg_confidence": 0,
                "coverage": 0,  # kept for backward compatibility
                "top_3": []
            }

        by_source = {}
        for skill in skills:
            source = skill.get("source", "unknown")
            by_source[source] = by_source.get(source, 0) + 1

        avg_confidence = sum(s.get("confidence", 0) for s in skills) / len(skills)

        return {
            "total": len(skills),
            "by_source": by_source,
            "avg_confidence": round(avg_confidence, 3),
            "top_3": [str(s.get("name", "")) for s in skills[:3]]
        }


# Usage example: run the hybrid extractor on a short sample CV and print the
# extracted skills followed by summary statistics.
if __name__ == "__main__":
    demo_cv = """
    Python developer with 5 years experience.
    Expert in FastAPI, Django, React, Docker.
    Strong communication and leadership skills.
    Fluent in English and French.
    """

    skill_finder = EnhancedSkillExtractor()
    found = skill_finder.extract_skills_hybrid(demo_cv)
    summary = skill_finder.get_extraction_stats(found)

    # One line per skill: name, originating extractor, confidence.
    print(f"Found {summary['total']} skills:")
    for item in found:
        name, source, confidence = item['name'], item['source'], item['confidence']
        print(f"  - {name:20} ({source:10}) conf: {confidence:.2f}")

    summary_json = json.dumps(summary, indent=2)
    print(f"\nStats: {summary_json}")