| """ |
| Enhanced Skill Extractor - Combines NER + Dictionary Fuzzy Matching |
| Step 6 optimization: hybrid approach for maximum skill coverage |
| """ |
|
|
| from typing import List, Dict, Optional |
| import json |
| import os |
| import re |
|
|
| try: |
| from transformers import pipeline |
| NER_AVAILABLE = True |
| except ImportError: |
| NER_AVAILABLE = False |
|
|
| from ai_module.nlp.skill_extractor import SkillExtractor |
| from ai_module.matching.semantic_matcher import SemanticSkillMatcher |
|
|
| try: |
| from ai_module.nlp.multilingual_skill_extractor import MultilingualSkillExtractor |
| MULTILINGUAL_EXTRACTOR_AVAILABLE = True |
| except Exception: |
| MultilingualSkillExtractor = None |
| MULTILINGUAL_EXTRACTOR_AVAILABLE = False |
|
|
|
|
class EnhancedSkillExtractor:
    """
    Extract skills from free text by merging several extraction sources.

    Sources are consulted in a fixed order and merged case-insensitively by
    skill name (the first occurrence of a name wins):
    1. Transformer NER pipeline (optional, see ``load_ner``)
    2. Dictionary fuzzy matching through ``SkillExtractor``
    3. ``MultilingualSkillExtractor`` (only when its import succeeded)

    The merged list is then normalized against the canonical skill list with
    ``SemanticSkillMatcher`` and sorted by descending confidence.
    """

    # Only this many leading characters of the text are sent to the NER model.
    NER_MAX_CHARS = 2000
    # NER entities must score strictly above this value to be kept.
    NER_MIN_SCORE = 0.70
    # Entity groups treated as skill candidates.
    NER_ENTITY_GROUPS = frozenset({"MISC", "ORG", "SKILL"})
    # Fixed confidence assigned to every dictionary/fuzzy match.
    DICT_CONFIDENCE = 0.80

    # Substring keywords used by _classify_skill when the dictionary has no
    # category for a skill. Checked in this order: tech, soft, language.
    _TECH_KEYWORDS = (
        "python", "java", "javascript", "react", "angular", "vue", "node",
        "aws", "azure", "gcp", "docker", "kubernetes", "sql", "mongodb",
        "api", "rest", "graphql", "fastapi", "django", "flask",
    )
    _SOFT_KEYWORDS = (
        "leadership", "communication", "teamwork", "management", "planning",
        "problem solving", "analytical", "creative", "adaptability",
    )
    _LANGUAGE_KEYWORDS = (
        "english", "french", "spanish", "german", "italian", "arabic",
        "portuguese", "mandarin", "japanese",
    )

    def __init__(self, load_ner: bool = True):
        """
        Build the dictionary extractor and, optionally, the NER pipeline.

        Args:
            load_ner: When True and ``transformers`` is importable, try to
                load the NER model named by the ``HF_SKILL_NER_MODEL``
                environment variable (default ``dslim/bert-base-NER``).
                A loading failure is reported on stdout and NER is disabled.
        """
        self.skill_extractor = SkillExtractor()
        self.canonical_skills = list(self.skill_extractor.all_skills)
        self.semantic_threshold = float(os.getenv("SKILL_NORMALIZATION_THRESHOLD", "0.62"))
        self.ner_model_name = os.getenv("HF_SKILL_NER_MODEL", "dslim/bert-base-NER")
        self.multilingual_extractor = (
            MultilingualSkillExtractor() if MULTILINGUAL_EXTRACTOR_AVAILABLE else None
        )

        # Always define both attributes so later code never hits an
        # AttributeError when NER is disabled or failed to load.
        self.ner_pipeline = None
        self.ner_available = False
        if load_ner and NER_AVAILABLE:
            try:
                self.ner_pipeline = pipeline(
                    "ner",
                    model=self.ner_model_name,
                    aggregation_strategy="simple",
                )
                self.ner_available = True
            except Exception as e:
                # Best-effort: fall back to dictionary-only extraction.
                print(f"⚠️ NER model not available: {e}. Using dictionary-only extraction.")

    @staticmethod
    def _merge_unique(merged: List[Dict], seen: set, candidates) -> None:
        """Append each candidate whose lower-cased name is not yet in ``seen``."""
        for skill_data in candidates:
            key = skill_data["name"].lower()
            if key not in seen:
                seen.add(key)
                merged.append(skill_data)

    def extract_skills_hybrid(self, text: str, threshold: int = 80) -> List[Dict]:
        """
        Extract skills using NER, dictionary and multilingual extraction.

        Args:
            text: CV text. Empty or None yields an empty list.
            threshold: Fuzzy matching threshold forwarded to
                ``SkillExtractor.extract_skills``.

        Returns:
            List of skill dicts sorted by descending ``confidence``, e.g.
            [
                {"name": "Python", "source": "NER", "confidence": 0.95},
                {"name": "React", "source": "DICT-FUZZY", "confidence": 0.80}
            ]
        """
        if not text:
            return []

        all_skills: List[Dict] = []
        seen_skills = set()

        # 1. NER candidates go first, so they win name collisions.
        if self.ner_available:
            self._merge_unique(all_skills, seen_skills, self._extract_via_ner(text))

        # 2. Dictionary/fuzzy matches, re-shaped into the common schema.
        dict_skills = self.skill_extractor.extract_skills(text, threshold=threshold)
        self._merge_unique(
            all_skills,
            seen_skills,
            (
                {
                    "name": dict_skill["name"],
                    "source": "DICT-FUZZY",
                    "confidence": self.DICT_CONFIDENCE,
                    "category": dict_skill.get("category", "tech"),
                    "method": dict_skill.get("method", "fuzzy"),
                }
                for dict_skill in dict_skills
            ),
        )

        # 3. Multilingual extractor output is merged as returned.
        if self.multilingual_extractor is not None:
            self._merge_unique(
                all_skills,
                seen_skills,
                self.multilingual_extractor.extract_skills(text),
            )

        # Keep the un-normalized list when normalization yields nothing.
        normalized = self._normalize_with_embeddings(all_skills)
        if normalized:
            all_skills = normalized

        all_skills.sort(key=lambda x: x.get("confidence", 0), reverse=True)
        return all_skills

    def _normalize_with_embeddings(self, skills: List[Dict]) -> List[Dict]:
        """
        Map extracted skill variants to the nearest canonical skill name.

        Each skill is looked up with ``SemanticSkillMatcher.search_similar``.
        When the best match reaches ``self.semantic_threshold`` the skill is
        renamed to it and annotated with ``normalized_name`` and
        ``normalization_similarity``. Skills with a blank name are dropped,
        and duplicates (case-insensitive, after renaming) keep the first
        occurrence.

        Args:
            skills: Skill dicts. They are not mutated; shallow copies are
                returned instead.

        Returns:
            New list of normalized, de-duplicated skill dicts.
        """
        if not skills:
            return []

        normalized: List[Dict] = []
        seen = set()

        for original in skills:
            raw_name = str(original.get("name", "")).strip()
            if not raw_name:
                continue

            # Work on a copy so dicts owned by the extractors stay untouched.
            skill = dict(original)

            nearest = SemanticSkillMatcher.search_similar(raw_name, self.canonical_skills, top_k=1)
            if nearest:
                best_name, similarity = nearest[0]
                if similarity >= self.semantic_threshold:
                    skill["normalized_name"] = best_name
                    skill["normalization_similarity"] = round(similarity, 4)
                    skill["name"] = best_name

            key = str(skill.get("name", "")).lower()
            if key in seen:
                continue
            seen.add(key)
            normalized.append(skill)

        return normalized

    def _extract_via_ner(self, text: str) -> List[Dict]:
        """
        Extract candidate skill entities from the NER pipeline output.

        Only the first ``NER_MAX_CHARS`` characters are analysed. Entities
        are kept when their group is in ``NER_ENTITY_GROUPS``, their score is
        strictly above ``NER_MIN_SCORE`` and the cleaned word has at least
        two characters.

        Args:
            text: Text to analyse.

        Returns:
            List of skill dicts with ``name``, ``source``, ``confidence``,
            ``category`` and ``method``. Empty when NER is unavailable, the
            text is empty, or the pipeline raises.
        """
        if not self.ner_available or not text:
            return []

        try:
            ner_results = self.ner_pipeline(text[: self.NER_MAX_CHARS])

            ner_skills = []
            for entity in ner_results:
                group = str(entity.get("entity_group", "")).upper()
                if group not in self.NER_ENTITY_GROUPS:
                    continue

                score = entity.get("score", 0)
                if score <= self.NER_MIN_SCORE:
                    continue

                # Remove "##" markers first, then collapse and trim whitespace;
                # trimming before removal could leave a leading space behind.
                skill_name = str(entity.get("word", "")).replace("##", "")
                skill_name = re.sub(r"\s+", " ", skill_name).strip()
                if len(skill_name) < 2:
                    continue

                ner_skills.append({
                    "name": skill_name.title(),
                    "source": "NER",
                    "confidence": float(score),
                    "category": self._classify_skill(skill_name),
                    "method": f"NER-{self.ner_model_name}",
                })

            return ner_skills
        except Exception as e:
            # Best-effort: a failing model must not break the other sources.
            print(f"⚠️ NER extraction error: {e}")
            return []

    def _classify_skill(self, skill_name: str) -> str:
        """
        Classify a skill name into a category.

        The dictionary extractor's ``skill_categories`` mapping is consulted
        first (lower-cased exact key). Otherwise the name is matched by
        substring against the tech, soft and language keyword tuples, in
        that order.

        Args:
            skill_name: Skill name in any casing.

        Returns:
            The dictionary category, or "tech", "soft" or "language";
            "tech" is the fallback when nothing matches.
        """
        skill_lower = skill_name.lower()

        category = self.skill_extractor.skill_categories.get(skill_lower)
        if category:
            return category

        keyword_groups = (
            ("tech", self._TECH_KEYWORDS),
            ("soft", self._SOFT_KEYWORDS),
            ("language", self._LANGUAGE_KEYWORDS),
        )
        for label, keywords in keyword_groups:
            if any(keyword in skill_lower for keyword in keywords):
                return label
        return "tech"

    def get_extraction_stats(self, skills: List[Dict]) -> Dict:
        """
        Summarize an extraction result.

        Args:
            skills: Skill dicts as returned by ``extract_skills_hybrid``.

        Returns:
            Dict with ``total``, ``by_source`` (count per ``source`` value,
            "unknown" when missing), ``avg_confidence`` (rounded to 3
            decimals) and ``top_3`` (names of the first three skills).
            The empty result additionally carries ``coverage: 0``.
        """
        if not skills:
            return {
                "total": 0,
                "by_source": {},
                "avg_confidence": 0,
                # NOTE(review): "coverage" is only ever reported here; it is
                # kept for backward compatibility. Its meaning for non-empty
                # input is not defined in this file.
                "coverage": 0,
                # Present in both branches so callers can read it safely.
                "top_3": [],
            }

        by_source: Dict[str, int] = {}
        for skill in skills:
            source = skill.get("source", "unknown")
            by_source[source] = by_source.get(source, 0) + 1

        avg_confidence = sum(s.get("confidence", 0) for s in skills) / len(skills)

        return {
            "total": len(skills),
            "by_source": by_source,
            "avg_confidence": round(avg_confidence, 3),
            "top_3": [s.get("name", "") for s in skills[:3]],
        }
|
|
|
|
| |
if __name__ == "__main__":
    # Manual smoke test: run the hybrid extractor on a short sample CV and
    # print every skill found, followed by the aggregate statistics.
    sample_text = """
    Python developer with 5 years experience.
    Expert in FastAPI, Django, React, Docker.
    Strong communication and leadership skills.
    Fluent in English and French.
    """

    demo_extractor = EnhancedSkillExtractor()
    found_skills = demo_extractor.extract_skills_hybrid(sample_text)
    summary = demo_extractor.get_extraction_stats(found_skills)

    total_found = summary['total']
    print(f"Found {total_found} skills:")
    for entry in found_skills:
        name, origin, score = entry['name'], entry['source'], entry['confidence']
        print(f" - {name:20} ({origin:10}) conf: {score:.2f}")

    summary_json = json.dumps(summary, indent=2)
    print(f"\nStats: {summary_json}")
|
|