ai-talent-finder-backend / ai_module /nlp /enhanced_skill_extractor.py
ilyass yani
Deploiement backend dans HF Spaces
9df97a2
Raw
History Blame
9.79 kB
"""
Enhanced Skill Extractor - Combines NER + Dictionary Fuzzy Matching
Step 6 optimization: hybrid approach for maximum skill coverage
"""
from typing import List, Dict, Optional
import json
import os
import re
try:
from transformers import pipeline
NER_AVAILABLE = True
except ImportError:
NER_AVAILABLE = False
from ai_module.nlp.skill_extractor import SkillExtractor
from ai_module.matching.semantic_matcher import SemanticSkillMatcher
try:
from ai_module.nlp.multilingual_skill_extractor import MultilingualSkillExtractor
MULTILINGUAL_EXTRACTOR_AVAILABLE = True
except Exception:
MultilingualSkillExtractor = None
MULTILINGUAL_EXTRACTOR_AVAILABLE = False
class EnhancedSkillExtractor:
    """
    Extract skills from CV text with a hybrid pipeline.

    Candidates are collected, in this order, from:
    1. A Hugging Face NER pipeline (optional; skipped when it cannot be loaded)
    2. The dictionary / fuzzy-matching ``SkillExtractor``
    3. The multilingual alias extractor (optional)

    Candidates are de-duplicated case-insensitively (first occurrence wins),
    mapped onto the canonical skill list via embedding similarity, and
    finally sorted by confidence.
    """

    # Entity groups from the NER model that are kept as skill candidates.
    _NER_SKILL_GROUPS = frozenset({"MISC", "ORG", "SKILL"})
    # NER entities must score strictly above this value to be kept.
    _NER_MIN_SCORE = 0.70
    # Only this many leading characters are sent to the NER model, to keep
    # runtime bounded.
    _NER_MAX_CHARS = 2000
    # Fixed confidence assigned to dictionary/fuzzy matches.
    _DICT_CONFIDENCE = 0.80

    # Keyword fallbacks for _classify_skill, checked in this order using
    # substring matching; the first category with a hit wins.
    _CATEGORY_KEYWORDS = (
        ("tech", (
            "python", "java", "javascript", "react", "angular", "vue", "node",
            "aws", "azure", "gcp", "docker", "kubernetes", "sql", "mongodb",
            "api", "rest", "graphql", "fastapi", "django", "flask",
        )),
        ("soft", (
            "leadership", "communication", "teamwork", "management", "planning",
            "problem solving", "analytical", "creative", "adaptability",
        )),
        ("language", (
            "english", "french", "spanish", "german", "italian", "arabic",
            "portuguese", "mandarin", "japanese",
        )),
    )
    _DEFAULT_CATEGORY = "tech"

    def __init__(self, load_ner: bool = True):
        """
        Initialize the dictionary extractor and, optionally, the NER pipeline.

        Args:
            load_ner: When False, the NER model is not loaded and only the
                dictionary (and multilingual, if importable) extractors run.

        Environment variables read:
            SKILL_NORMALIZATION_THRESHOLD: minimum similarity for mapping a
                candidate onto a canonical skill (default "0.62").
            HF_SKILL_NER_MODEL: model id passed to the NER pipeline
                (default "dslim/bert-base-NER").
        """
        self.skill_extractor = SkillExtractor()
        self.canonical_skills = list(self.skill_extractor.all_skills)
        self.semantic_threshold = float(os.getenv("SKILL_NORMALIZATION_THRESHOLD", "0.62"))
        self.ner_model_name = os.getenv("HF_SKILL_NER_MODEL", "dslim/bert-base-NER")
        self.multilingual_extractor = (
            MultilingualSkillExtractor() if MULTILINGUAL_EXTRACTOR_AVAILABLE else None
        )

        # Always define both attributes so later code never hits an
        # AttributeError when NER is disabled or failed to load.
        self.ner_pipeline = None
        self.ner_available = False

        if load_ner and NER_AVAILABLE:
            try:
                self.ner_pipeline = pipeline(
                    "ner",
                    model=self.ner_model_name,
                    aggregation_strategy="simple",
                )
                self.ner_available = True
            except Exception as e:
                # Best-effort: model download/load problems must not prevent
                # dictionary-only extraction.
                print(f"⚠️ NER model not available: {e}. Using dictionary-only extraction.")

    def extract_skills_hybrid(self, text: str, threshold: int = 80) -> List[Dict]:
        """
        Extract skills using the NER, dictionary and multilingual extractors.

        Args:
            text: CV text. Empty, whitespace-only or None input yields [].
            threshold: Fuzzy matching threshold forwarded to the dictionary
                extractor.

        Returns:
            List of skill dicts sorted by descending confidence, e.g.
            [
                {"name": "Python", "source": "NER", "confidence": 0.95, ...},
                {"name": "React", "source": "DICT-FUZZY", "confidence": 0.80, ...}
            ]
        """
        if not text or not text.strip():
            return []

        merged: List[Dict] = []
        seen_names = set()  # lowercase names already added

        def add_unique(candidate: Dict) -> None:
            # First occurrence of a name wins; unnamed candidates are dropped.
            key = str(candidate.get("name", "")).strip().lower()
            if key and key not in seen_names:
                seen_names.add(key)
                merged.append(candidate)

        # Step 1: NER candidates (if the model is loaded).
        if self.ner_available:
            for candidate in self._extract_via_ner(text):
                add_unique(candidate)

        # Step 2: dictionary candidates (for coverage), re-wrapped with
        # uniform metadata.
        for found in self.skill_extractor.extract_skills(text, threshold=threshold):
            add_unique({
                "name": found.get("name", ""),
                "source": "DICT-FUZZY",
                "confidence": self._DICT_CONFIDENCE,  # lower than typical NER scores
                "category": found.get("category", "tech"),
                "method": found.get("method", "fuzzy"),
            })

        # Step 2b: multilingual aliases (FR/EN/ES) for better recall.
        if self.multilingual_extractor is not None:
            for candidate in self.multilingual_extractor.extract_skills(text):
                add_unique(candidate)

        # Step 3: embedding-based normalization onto the canonical skill list.
        # An empty result keeps the un-normalized candidates.
        normalized = self._normalize_with_embeddings(merged)
        if normalized:
            merged = normalized

        return sorted(merged, key=lambda s: s.get("confidence", 0), reverse=True)

    def _normalize_with_embeddings(self, skills: List[Dict]) -> List[Dict]:
        """
        Map extracted skill variants to the nearest canonical skill.

        Each skill is looked up with ``SemanticSkillMatcher.search_similar``
        (expected to return a list of ``(name, similarity)`` pairs, as the
        unpacking below assumes). When the best similarity reaches
        ``self.semantic_threshold`` the skill is renamed to the canonical
        name and annotated with ``normalized_name`` and
        ``normalization_similarity``.

        Args:
            skills: Candidate skill dicts. They are not modified; copies are
                returned.

        Returns:
            Normalized skills, de-duplicated case-insensitively by final name
            (first occurrence wins). Entries with a blank name are dropped.
        """
        if not skills:
            return []

        normalized: List[Dict] = []
        seen = set()
        for original in skills:
            raw_name = str(original.get("name", "")).strip()
            if not raw_name:
                continue

            # Work on a copy so dicts owned by the caller (or cached by another
            # extractor) are never mutated.
            skill = dict(original)
            nearest = SemanticSkillMatcher.search_similar(
                raw_name, self.canonical_skills, top_k=1
            )
            if nearest:
                best_name, similarity = nearest[0]
                if similarity >= self.semantic_threshold:
                    skill["normalized_name"] = best_name
                    # float() keeps the value JSON-serialisable even if the
                    # matcher returns a non-builtin numeric type.
                    skill["normalization_similarity"] = round(float(similarity), 4)
                    skill["name"] = best_name

            key = str(skill.get("name", "")).lower()
            if key in seen:
                continue
            seen.add(key)
            normalized.append(skill)
        return normalized

    def _extract_via_ner(self, text: str) -> List[Dict]:
        """
        Extract candidate skill entities from the NER pipeline output.

        Only entities whose group is in ``_NER_SKILL_GROUPS`` and whose score
        is strictly above ``_NER_MIN_SCORE`` are kept. Names have "##"
        word-piece markers removed, whitespace collapsed and are title-cased.

        Args:
            text: CV text; only the first ``_NER_MAX_CHARS`` characters are
                analysed.

        Returns:
            List of skill dicts with source "NER". Empty when NER is
            unavailable, the text is empty, or the pipeline raises.
        """
        ner = getattr(self, "ner_pipeline", None)
        if not self.ner_available or ner is None or not text:
            return []

        try:
            entities = ner(text[:self._NER_MAX_CHARS])
            found: List[Dict] = []
            for entity in entities:
                group = str(entity.get("entity_group", "")).upper()
                if group not in self._NER_SKILL_GROUPS:
                    continue

                score = float(entity.get("score", 0) or 0)
                if score <= self._NER_MIN_SCORE:
                    continue

                name = str(entity.get("word", "")).strip().replace("##", "")
                name = re.sub(r"\s+", " ", name)
                if len(name) < 2:
                    continue

                found.append({
                    "name": name.title(),
                    "source": "NER",
                    "confidence": score,
                    "category": self._classify_skill(name),
                    "method": f"NER-{self.ner_model_name}",
                })
            return found
        except Exception as e:
            # Best-effort: a failing model must not break dictionary extraction.
            print(f"⚠️ NER extraction error: {e}")
            return []

    def _classify_skill(self, skill_name: str) -> str:
        """
        Classify a skill name into a category.

        The dictionary extractor's ``skill_categories`` mapping is consulted
        first (exact lowercase match). Otherwise the name is matched by
        substring against the tech, soft and language keyword lists, in that
        order. Unmatched names default to "tech".
        """
        skill_lower = skill_name.lower()

        known = self.skill_extractor.skill_categories.get(skill_lower)
        if known:
            return known

        for category, keywords in self._CATEGORY_KEYWORDS:
            if any(keyword in skill_lower for keyword in keywords):
                return category
        return self._DEFAULT_CATEGORY

    def get_extraction_stats(self, skills: List[Dict]) -> Dict:
        """
        Summarize an extraction result.

        Args:
            skills: Skill dicts as returned by ``extract_skills_hybrid``.

        Returns:
            Dict with ``total``, ``by_source`` (count per source),
            ``avg_confidence`` (rounded to 3 decimals) and ``top_3`` (names
            of the first three skills). These keys are present for empty
            input too. The empty result additionally keeps the legacy
            ``coverage`` key for backward compatibility.
        """
        if not skills:
            return {
                "total": 0,
                "by_source": {},
                "avg_confidence": 0,
                "coverage": 0,
                "top_3": [],
            }

        by_source: Dict[str, int] = {}
        for skill in skills:
            source = skill.get("source", "unknown")
            by_source[source] = by_source.get(source, 0) + 1

        avg_confidence = sum(s.get("confidence", 0) for s in skills) / len(skills)
        return {
            "total": len(skills),
            "by_source": by_source,
            "avg_confidence": round(float(avg_confidence), 3),
            "top_3": [s.get("name", "") for s in skills[:3]],
        }
# Usage example: run the hybrid extractor on a small sample CV and print
# the extracted skills followed by summary statistics.
def _run_demo() -> None:
    """Extract skills from a hard-coded sample text and print a report."""
    sample_text = """
Python developer with 5 years experience.
Expert in FastAPI, Django, React, Docker.
Strong communication and leadership skills.
Fluent in English and French.
"""
    demo_extractor = EnhancedSkillExtractor()
    skills = demo_extractor.extract_skills_hybrid(sample_text)
    stats = demo_extractor.get_extraction_stats(skills)

    print(f"Found {stats['total']} skills:")
    for skill in skills:
        print(f" - {skill['name']:20} ({skill['source']:10}) conf: {skill['confidence']:.2f}")
    print(f"\nStats: {json.dumps(stats, indent=2)}")


if __name__ == "__main__":
    _run_demo()