""" Enhanced Skill Extractor - Combines NER + Dictionary Fuzzy Matching Étape 6 Optimization: Hybrid approach for maximum skill coverage """ from typing import List, Dict, Optional import json import os import re try: from transformers import pipeline NER_AVAILABLE = True except ImportError: NER_AVAILABLE = False from ai_module.nlp.skill_extractor import SkillExtractor from ai_module.matching.semantic_matcher import SemanticSkillMatcher try: from ai_module.nlp.multilingual_skill_extractor import MultilingualSkillExtractor MULTILINGUAL_EXTRACTOR_AVAILABLE = True except Exception: MultilingualSkillExtractor = None MULTILINGUAL_EXTRACTOR_AVAILABLE = False class EnhancedSkillExtractor: """ Extract skills using hybrid approach: 1. BERT-based NER (95% accuracy) 2. Dictionary fuzzy matching (80% accuracy) 3. Intelligent merge with confidence scores """ def __init__(self, load_ner: bool = True): """Initialize both extraction methods""" self.skill_extractor = SkillExtractor() self.canonical_skills = list(self.skill_extractor.all_skills) self.semantic_threshold = float(os.getenv("SKILL_NORMALIZATION_THRESHOLD", "0.62")) self.ner_model_name = os.getenv("HF_SKILL_NER_MODEL", "dslim/bert-base-NER") self.multilingual_extractor = MultilingualSkillExtractor() if MULTILINGUAL_EXTRACTOR_AVAILABLE else None if load_ner and NER_AVAILABLE: try: self.ner_pipeline = pipeline( "ner", model=self.ner_model_name, aggregation_strategy="simple" ) self.ner_available = True except Exception as e: print(f"⚠️ NER model not available: {e}. Using dictionary-only extraction.") self.ner_available = False else: self.ner_available = False def extract_skills_hybrid(self, text: str, threshold: int = 80) -> List[Dict]: """ Extract skills using both NER and dictionary methods Args: text: CV text threshold: Fuzzy matching threshold Returns: List of skills with source and confidence [ {"name": "Python", "source": "NER", "confidence": 0.95}, {"name": "React", "source": "DICT-FUZZY", "confidence": 0.80} ] """ all_skills = [] seen_skills = set() # Track added skills (lowercase) # Step 1: Extract via NER (if available) if self.ner_available: ner_skills = self._extract_via_ner(text) for skill_data in ner_skills: skill_name_lower = skill_data["name"].lower() if skill_name_lower not in seen_skills: all_skills.append(skill_data) seen_skills.add(skill_name_lower) # Step 2: Extract via Dictionary (for coverage) dict_skills = self.skill_extractor.extract_skills(text, threshold=threshold) for dict_skill in dict_skills: skill_name_lower = dict_skill["name"].lower() if skill_name_lower not in seen_skills: # Enhance dictionary skill with additional metadata skill_data = { "name": dict_skill["name"], "source": "DICT-FUZZY", "confidence": 0.80, # Lower than NER "category": dict_skill.get("category", "tech"), "method": dict_skill.get("method", "fuzzy") } all_skills.append(skill_data) seen_skills.add(skill_name_lower) # Step 2b: Multilingual aliases (FR/EN/ES) for better recall. if self.multilingual_extractor is not None: multilingual_skills = self.multilingual_extractor.extract_skills(text) for skill_data in multilingual_skills: skill_name_lower = skill_data["name"].lower() if skill_name_lower not in seen_skills: all_skills.append(skill_data) seen_skills.add(skill_name_lower) # Step 3: Embedding-based normalization to canonical skill list. normalized = self._normalize_with_embeddings(all_skills) if normalized: all_skills = normalized # Sort by confidence descending all_skills.sort(key=lambda x: x.get("confidence", 0), reverse=True) return all_skills def _normalize_with_embeddings(self, skills: List[Dict]) -> List[Dict]: """Map extracted skill variants to nearest canonical skills via embeddings.""" if not skills: return [] normalized: List[Dict] = [] seen = set() for skill in skills: raw_name = str(skill.get("name", "")).strip() if not raw_name: continue nearest = SemanticSkillMatcher.search_similar(raw_name, self.canonical_skills, top_k=1) if nearest: best_name, similarity = nearest[0] if similarity >= self.semantic_threshold: skill["normalized_name"] = best_name skill["normalization_similarity"] = round(similarity, 4) skill["name"] = best_name key = str(skill.get("name", "")).lower() if key in seen: continue seen.add(key) normalized.append(skill) return normalized def _extract_via_ner(self, text: str) -> List[Dict]: """Extract candidate skill entities from NER pipeline output.""" if not self.ner_available or not text: return [] try: # Keep runtime bounded. text_truncated = text[:2000] ner_results = self.ner_pipeline(text_truncated) ner_skills = [] for entity in ner_results: group = str(entity.get("entity_group", "")).upper() if group not in {"MISC", "ORG", "SKILL"}: continue if entity.get("score", 0) <= 0.70: continue skill_name = str(entity.get("word", "")).strip().replace("##", "") skill_name = re.sub(r"\s+", " ", skill_name) if len(skill_name) < 2: continue ner_skills.append({ "name": skill_name.title(), "source": "NER", "confidence": float(entity.get("score", 0.95)), "category": self._classify_skill(skill_name), "method": f"NER-{self.ner_model_name}" }) return ner_skills except Exception as e: print(f"⚠️ NER extraction error: {e}") return [] def _classify_skill(self, skill_name: str) -> str: """Classify skill into category""" skill_lower = skill_name.lower() # Check in existing skill categories category = self.skill_extractor.skill_categories.get(skill_lower, None) if category: return category # Smart classification based on keywords tech_keywords = ["python", "java", "javascript", "react", "angular", "vue", "node", "aws", "azure", "gcp", "docker", "kubernetes", "sql", "mongodb", "api", "rest", "graphql", "fastapi", "django", "flask"] soft_keywords = ["leadership", "communication", "teamwork", "management", "planning", "problem solving", "analytical", "creative", "adaptability"] language_keywords = ["english", "french", "spanish", "german", "italian", "arabic", "portuguese", "mandarin", "japanese"] if any(keyword in skill_lower for keyword in tech_keywords): return "tech" elif any(keyword in skill_lower for keyword in soft_keywords): return "soft" elif any(keyword in skill_lower for keyword in language_keywords): return "language" else: return "tech" # Default def get_extraction_stats(self, skills: List[Dict]) -> Dict: """Get statistics about extraction""" if not skills: return { "total": 0, "by_source": {}, "avg_confidence": 0, "coverage": 0 } by_source = {} for skill in skills: source = skill.get("source", "unknown") by_source[source] = by_source.get(source, 0) + 1 avg_confidence = sum(s.get("confidence", 0) for s in skills) / len(skills) return { "total": len(skills), "by_source": by_source, "avg_confidence": round(avg_confidence, 3), "top_3": [s["name"] for s in skills[:3]] } # Usage example if __name__ == "__main__": extractor = EnhancedSkillExtractor() sample_text = """ Python developer with 5 years experience. Expert in FastAPI, Django, React, Docker. Strong communication and leadership skills. Fluent in English and French. """ skills = extractor.extract_skills_hybrid(sample_text) stats = extractor.get_extraction_stats(skills) print(f"Found {stats['total']} skills:") for skill in skills: print(f" - {skill['name']:20} ({skill['source']:10}) conf: {skill['confidence']:.2f}") print(f"\nStats: {json.dumps(stats, indent=2)}")