Spaces:

RHmaster
/

ai-talent-finder-backend

Sleeping

ai-talent-finder-backend / ai_module /nlp /enhanced_skill_extractor.py

ilyass yani

Deploiement backend dans HF Spaces

9df97a2 11 days ago

9.79 kB

	"""
	Enhanced Skill Extractor - Combines NER + Dictionary Fuzzy Matching
	Étape 6 Optimization: Hybrid approach for maximum skill coverage
	"""

	from typing import List, Dict, Optional
	import json
	import os
	import re

	try:
	from transformers import pipeline
	NER_AVAILABLE = True
	except ImportError:
	NER_AVAILABLE = False

	from ai_module.nlp.skill_extractor import SkillExtractor
	from ai_module.matching.semantic_matcher import SemanticSkillMatcher

	try:
	from ai_module.nlp.multilingual_skill_extractor import MultilingualSkillExtractor
	MULTILINGUAL_EXTRACTOR_AVAILABLE = True
	except Exception:
	MultilingualSkillExtractor = None
	MULTILINGUAL_EXTRACTOR_AVAILABLE = False


	class EnhancedSkillExtractor:
	"""
	Extract skills using hybrid approach:
	1. BERT-based NER (95% accuracy)
	2. Dictionary fuzzy matching (80% accuracy)
	3. Intelligent merge with confidence scores
	"""

	def __init__(self, load_ner: bool = True):
	"""Initialize both extraction methods"""
	self.skill_extractor = SkillExtractor()
	self.canonical_skills = list(self.skill_extractor.all_skills)
	self.semantic_threshold = float(os.getenv("SKILL_NORMALIZATION_THRESHOLD", "0.62"))
	self.ner_model_name = os.getenv("HF_SKILL_NER_MODEL", "dslim/bert-base-NER")
	self.multilingual_extractor = MultilingualSkillExtractor() if MULTILINGUAL_EXTRACTOR_AVAILABLE else None

	if load_ner and NER_AVAILABLE:
	try:
	self.ner_pipeline = pipeline(
	"ner",
	model=self.ner_model_name,
	aggregation_strategy="simple"
	)
	self.ner_available = True
	except Exception as e:
	print(f"⚠️ NER model not available: {e}. Using dictionary-only extraction.")
	self.ner_available = False
	else:
	self.ner_available = False

	def extract_skills_hybrid(self, text: str, threshold: int = 80) -> List[Dict]:
	"""
	Extract skills using both NER and dictionary methods

	Args:
	text: CV text
	threshold: Fuzzy matching threshold

	Returns:
	List of skills with source and confidence
	[
	{"name": "Python", "source": "NER", "confidence": 0.95},
	{"name": "React", "source": "DICT-FUZZY", "confidence": 0.80}
	]
	"""
	all_skills = []
	seen_skills = set() # Track added skills (lowercase)

	# Step 1: Extract via NER (if available)
	if self.ner_available:
	ner_skills = self._extract_via_ner(text)
	for skill_data in ner_skills:
	skill_name_lower = skill_data["name"].lower()
	if skill_name_lower not in seen_skills:
	all_skills.append(skill_data)
	seen_skills.add(skill_name_lower)

	# Step 2: Extract via Dictionary (for coverage)
	dict_skills = self.skill_extractor.extract_skills(text, threshold=threshold)
	for dict_skill in dict_skills:
	skill_name_lower = dict_skill["name"].lower()
	if skill_name_lower not in seen_skills:
	# Enhance dictionary skill with additional metadata
	skill_data = {
	"name": dict_skill["name"],
	"source": "DICT-FUZZY",
	"confidence": 0.80, # Lower than NER
	"category": dict_skill.get("category", "tech"),
	"method": dict_skill.get("method", "fuzzy")
	}
	all_skills.append(skill_data)
	seen_skills.add(skill_name_lower)

	# Step 2b: Multilingual aliases (FR/EN/ES) for better recall.
	if self.multilingual_extractor is not None:
	multilingual_skills = self.multilingual_extractor.extract_skills(text)
	for skill_data in multilingual_skills:
	skill_name_lower = skill_data["name"].lower()
	if skill_name_lower not in seen_skills:
	all_skills.append(skill_data)
	seen_skills.add(skill_name_lower)

	# Step 3: Embedding-based normalization to canonical skill list.
	normalized = self._normalize_with_embeddings(all_skills)
	if normalized:
	all_skills = normalized

	# Sort by confidence descending
	all_skills.sort(key=lambda x: x.get("confidence", 0), reverse=True)

	return all_skills

	def _normalize_with_embeddings(self, skills: List[Dict]) -> List[Dict]:
	"""Map extracted skill variants to nearest canonical skills via embeddings."""
	if not skills:
	return []

	normalized: List[Dict] = []
	seen = set()

	for skill in skills:
	raw_name = str(skill.get("name", "")).strip()
	if not raw_name:
	continue

	nearest = SemanticSkillMatcher.search_similar(raw_name, self.canonical_skills, top_k=1)
	if nearest:
	best_name, similarity = nearest[0]
	if similarity >= self.semantic_threshold:
	skill["normalized_name"] = best_name
	skill["normalization_similarity"] = round(similarity, 4)
	skill["name"] = best_name

	key = str(skill.get("name", "")).lower()
	if key in seen:
	continue
	seen.add(key)
	normalized.append(skill)

	return normalized

	def _extract_via_ner(self, text: str) -> List[Dict]:
	"""Extract candidate skill entities from NER pipeline output."""
	if not self.ner_available or not text:
	return []

	try:
	# Keep runtime bounded.
	text_truncated = text[:2000]

	ner_results = self.ner_pipeline(text_truncated)

	ner_skills = []
	for entity in ner_results:
	group = str(entity.get("entity_group", "")).upper()
	if group not in {"MISC", "ORG", "SKILL"}:
	continue
	if entity.get("score", 0) <= 0.70:
	continue

	skill_name = str(entity.get("word", "")).strip().replace("##", "")
	skill_name = re.sub(r"\s+", " ", skill_name)
	if len(skill_name) < 2:
	continue

	ner_skills.append({
	"name": skill_name.title(),
	"source": "NER",
	"confidence": float(entity.get("score", 0.95)),
	"category": self._classify_skill(skill_name),
	"method": f"NER-{self.ner_model_name}"
	})

	return ner_skills
	except Exception as e:
	print(f"⚠️ NER extraction error: {e}")
	return []

	def _classify_skill(self, skill_name: str) -> str:
	"""Classify skill into category"""
	skill_lower = skill_name.lower()

	# Check in existing skill categories
	category = self.skill_extractor.skill_categories.get(skill_lower, None)
	if category:
	return category

	# Smart classification based on keywords
	tech_keywords = ["python", "java", "javascript", "react", "angular", "vue", "node",
	"aws", "azure", "gcp", "docker", "kubernetes", "sql", "mongodb",
	"api", "rest", "graphql", "fastapi", "django", "flask"]

	soft_keywords = ["leadership", "communication", "teamwork", "management", "planning",
	"problem solving", "analytical", "creative", "adaptability"]

	language_keywords = ["english", "french", "spanish", "german", "italian", "arabic",
	"portuguese", "mandarin", "japanese"]

	if any(keyword in skill_lower for keyword in tech_keywords):
	return "tech"
	elif any(keyword in skill_lower for keyword in soft_keywords):
	return "soft"
	elif any(keyword in skill_lower for keyword in language_keywords):
	return "language"
	else:
	return "tech" # Default

	def get_extraction_stats(self, skills: List[Dict]) -> Dict:
	"""Get statistics about extraction"""
	if not skills:
	return {
	"total": 0,
	"by_source": {},
	"avg_confidence": 0,
	"coverage": 0
	}

	by_source = {}
	for skill in skills:
	source = skill.get("source", "unknown")
	by_source[source] = by_source.get(source, 0) + 1

	avg_confidence = sum(s.get("confidence", 0) for s in skills) / len(skills)

	return {
	"total": len(skills),
	"by_source": by_source,
	"avg_confidence": round(avg_confidence, 3),
	"top_3": [s["name"] for s in skills[:3]]
	}


	# Usage example
	if __name__ == "__main__":
	extractor = EnhancedSkillExtractor()

	sample_text = """
	Python developer with 5 years experience.
	Expert in FastAPI, Django, React, Docker.
	Strong communication and leadership skills.
	Fluent in English and French.
	"""

	skills = extractor.extract_skills_hybrid(sample_text)
	stats = extractor.get_extraction_stats(skills)

	print(f"Found {stats['total']} skills:")
	for skill in skills:
	print(f" - {skill['name']:20} ({skill['source']:10}) conf: {skill['confidence']:.2f}")

	print(f"\nStats: {json.dumps(stats, indent=2)}")