Spaces:

RHmaster
/

ai-talent-finder-backend

Running

ai-talent-finder-backend / scripts /phase1_implementation_starter.py

ilyass yani

Deploiement backend dans HF Spaces

9df97a2 10 days ago

21.8 kB

	#!/usr/bin/env python3
	"""
	PHASE 1 IMPLEMENTATION STARTER — Quick Wins

	Trois implémentations concrètes pour démarrer immédiatement:
	1. Adaptive Thresholds (1-2 hours)
	2. Smart Deduplication (1-2 hours)
	3. Enhanced Explainability (2-3 hours)

	Prêt à copy-paste dans votre codebase.
	"""

	# ============================================================================
	# 1. ADAPTIVE THRESHOLDS (backend/ai_module/matching/adaptive_thresholds.py)
	# ============================================================================

	from typing import Dict, Tuple
	import re
	from datetime import datetime

	class AdaptiveThresholdEngine:
	    """
	    Select matching thresholds based on the domain of a job.

	    The domain is inferred from the job title by counting, for every domain
	    in DOMAIN_KEYWORDS, how many of its keywords occur in the title as whole
	    words/phrases. The domain with the most hits wins (ties go to the domain
	    declared first); a title with no hit at all maps to "default".

	    Usage:
	        engine = AdaptiveThresholdEngine()
	        thresholds = engine.get_thresholds("Senior Data Scientist")
	        # -> {"accept": 0.75, "review": 0.45, "confidence": 0.90}
	    """

	    # Per-domain thresholds. "accept"/"review" are score cut-offs,
	    # "confidence" is returned alongside them, "description" is the
	    # human-readable rationale exposed by get_thresholds_with_explanation().
	    DOMAIN_CONFIG = {
	        # Data science: very high bar (rare talent)
	        "data_science": {
	            "accept": 0.75,
	            "review": 0.45,
	            "confidence": 0.90,
	            "description": "High bar - specialized domain"
	        },
	        # Finance: safety first
	        "finance": {
	            "accept": 0.80,
	            "review": 0.50,
	            "confidence": 0.85,
	            "description": "Very strict - compliance critical"
	        },
	        # Backend: mixed skills are fine
	        "backend": {
	            "accept": 0.70,
	            "review": 0.40,
	            "confidence": 0.85,
	            "description": "Moderate - diverse tech ok"
	        },
	        # Frontend: creative + technical
	        "frontend": {
	            "accept": 0.70,
	            "review": 0.38,
	            "confidence": 0.80,
	            "description": "Moderate - UX skills valuable"
	        },
	        # Startup: high flexibility
	        "startup": {
	            "accept": 0.60,
	            "review": 0.30,
	            "confidence": 0.75,
	            "description": "Low bar - versatility valued"
	        },
	        # Product/PM: soft skills matter
	        "product": {
	            "accept": 0.65,
	            "review": 0.35,
	            "confidence": 0.80,
	            "description": "Moderate - soft skills matter"
	        },
	        # Sales/marketing: personality + skills
	        "business": {
	            "accept": 0.60,
	            "review": 0.32,
	            "confidence": 0.75,
	            "description": "Low bar - personality critical"
	        },
	        # ML/AI: highly specialized
	        "machine_learning": {
	            "accept": 0.78,
	            "review": 0.48,
	            "confidence": 0.90,
	            "description": "High bar - specialized"
	        },
	        # DevOps/infrastructure
	        "devops": {
	            "accept": 0.72,
	            "review": 0.42,
	            "confidence": 0.85,
	            "description": "High bar - reliability critical"
	        },
	        # Fallback used when no domain keyword matches
	        "default": {
	            "accept": 0.80,
	            "review": 0.50,
	            "confidence": 0.80,
	            "description": "Standard thresholds"
	        }
	    }

	    # Lowercase keywords used for domain detection. They are matched as
	    # whole words/phrases, so singular and plural forms must both be listed
	    # when both should count (e.g. "startup" and "startups").
	    DOMAIN_KEYWORDS = {
	        "data_science": [
	            "data scientist", "data science", "analytics", "statistical",
	            "machine learning", "ml engineer", "data engineer", "big data"
	        ],
	        "finance": [
	            "financial", "accountant", "trader", "analyst", "finance",
	            "risk", "banking", "investment", "portfolio"
	        ],
	        "backend": [
	            "backend", "server", "api", "python", "java", "golang",
	            "infrastructure", "architect", "systems engineer", "performance"
	        ],
	        "frontend": [
	            "frontend", "ui", "ux", "react", "vue", "angular",
	            "web developer", "designer", "visual", "css", "javascript"
	        ],
	        "startup": [
	            "startup", "startups", "founder", "early stage", "mvp",
	            "bootstrapped", "rapid", "agile", "full stack",
	            "jack of all trades"
	        ],
	        "product": [
	            "product manager", "pm", "product owner", "po",
	            "roadmap", "strategy", "vision", "user experience"
	        ],
	        "business": [
	            "sales", "business development", "marketing", "bd",
	            "account manager", "customer", "commercial", "partnership"
	        ],
	        "machine_learning": [
	            "machine learning", "ml", "deep learning", "neural",
	            "tensorflow", "pytorch", "ai engineer", "ai scientist"
	        ],
	        "devops": [
	            "devops", "sre", "kubernetes", "docker", "infrastructure",
	            "ci/cd", "deployment", "cloud", "aws", "gcp", "azure"
	        ],
	    }

	    @staticmethod
	    def _contains_keyword(text: str, keyword: str) -> bool:
	        """Return True when `keyword` occurs in `text` as a whole word/phrase."""
	        # Plain substring search makes short keywords fire inside unrelated
	        # words ("pm" in "development", "ml" in "html", "java" in
	        # "javascript"), so require a non-word character or a string edge on
	        # both sides. The keyword is escaped because some contain regex
	        # metacharacters or punctuation ("ci/cd").
	        pattern = r"(?<!\w)" + re.escape(keyword) + r"(?!\w)"
	        return re.search(pattern, text) is not None

	    @staticmethod
	    def _utc_now_iso() -> str:
	        """Return the current UTC time as a naive ISO-8601 string."""
	        # datetime.utcnow() is deprecated; take an aware "now" and drop the
	        # tzinfo so the string keeps the historical format (no "+00:00").
	        from datetime import timezone
	        return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

	    def _config_for(self, domain: str) -> Dict:
	        """Return the DOMAIN_CONFIG entry for `domain`, or the default entry."""
	        return self.DOMAIN_CONFIG.get(domain, self.DOMAIN_CONFIG["default"])

	    def detect_domain(self, job_title: str) -> str:
	        """
	        Detect the job domain from its title.

	        Args:
	            job_title: e.g. "Senior Data Scientist". Empty or None yields
	                "default".

	        Returns:
	            A domain slug from DOMAIN_KEYWORDS (e.g. "data_science"), or
	            "default" when no keyword matches.
	        """
	        if not job_title:
	            return "default"

	        title = job_title.lower()

	        # Score every domain by its number of whole-word keyword hits.
	        domain_scores = {
	            domain: sum(1 for kw in keywords if self._contains_keyword(title, kw))
	            for domain, keywords in self.DOMAIN_KEYWORDS.items()
	        }
	        if not domain_scores:
	            return "default"

	        # max() keeps the first maximum, so ties resolve to the domain that
	        # is declared first in DOMAIN_KEYWORDS.
	        best_domain = max(domain_scores, key=domain_scores.get)

	        if domain_scores[best_domain] == 0:
	            return "default"
	        return best_domain

	    def get_thresholds(self, job_title: str) -> Dict[str, float]:
	        """
	        Return the adaptive thresholds for a job.

	        Args:
	            job_title: Title of the job.

	        Returns:
	            A new dict such as
	            {"accept": 0.70, "review": 0.40, "confidence": 0.85}
	            (the config "description" is deliberately left out).
	        """
	        config = self._config_for(self.detect_domain(job_title))
	        return {
	            "accept": config["accept"],
	            "review": config["review"],
	            "confidence": config["confidence"],
	        }

	    def get_thresholds_with_explanation(self, job_title: str) -> Dict:
	        """
	        Return the thresholds together with the detected domain.

	        Returns:
	            A dict with keys "domain", "job_title", "thresholds",
	            "rationale" (the domain description) and "detected_at"
	            (naive UTC timestamp in ISO-8601 format).
	        """
	        domain = self.detect_domain(job_title)
	        config = self._config_for(domain)

	        return {
	            "domain": domain,
	            "job_title": job_title,
	            "thresholds": {
	                "accept": config["accept"],
	                "review": config["review"],
	                "confidence": config["confidence"],
	            },
	            "rationale": config["description"],
	            "detected_at": self._utc_now_iso(),
	        }


	# ============================================================================
	# 2. SMART DEDUPLICATION (backend/ai_module/nlp/smart_dedup.py)
	# ============================================================================

	from typing import List, Set
	from numpy import ndarray
	from sklearn.metrics.pairwise import cosine_similarity
	import numpy as np

	class SmartSkillDeduplicator:
	    """
	    Deduplicate skills, optionally using semantic similarity on top of
	    case-insensitive string matching.

	    Without an embedder only exact duplicates (ignoring case and surrounding
	    whitespace) are merged. With an embedder, skills whose embeddings are
	    similar enough are additionally merged into one canonical name.

	    Example:
	        dedup = SmartSkillDeduplicator()
	        dedup.deduplicate(["Python", "python", "python3"])
	        # -> ["Python", "python3"]
	    """

	    def __init__(self, embedder=None, similarity_threshold: float = 0.82):
	        """
	        Args:
	            embedder: Object exposing `encode(list_of_str)` (presumably a
	                SentenceTransformer - confirm against callers). When falsy,
	                only string-level deduplication is performed.
	            similarity_threshold: Cosine similarity that must be strictly
	                exceeded for two skills to be merged.
	        """
	        self.embedder = embedder
	        self.similarity_threshold = similarity_threshold

	    def deduplicate(self, skills: List[str]) -> List[str]:
	        """
	        Deduplicate a list of skills.

	        Args:
	            skills: e.g. ["Python", "python", "ML", "Machine Learning"]

	        Returns:
	            A new list of canonical names in first-seen order, using the
	            original (whitespace-stripped) spelling of the inputs. If the
	            embedder raises, a warning is printed and the string-level
	            result is returned.
	        """
	        if not skills:
	            return []

	        # First pass: exact dedup on the normalized key. The dict keeps
	        # insertion order and remembers the first original spelling seen for
	        # every key, so both return paths report the same kind of name.
	        first_seen = {}
	        for raw in skills:
	            spelling = raw.strip()
	            first_seen.setdefault(spelling.lower(), spelling)

	        normalized = list(first_seen)
	        representatives = list(first_seen.values())

	        if len(normalized) <= 1 or not self.embedder:
	            return representatives

	        # Second pass: semantic clustering. Cluster indices refer to
	        # `normalized`, so they must be resolved against `representatives`
	        # (index-aligned with it) and not against the raw input, whose
	        # positions shift as soon as it contains an exact duplicate.
	        try:
	            clusters = self._cluster_by_similarity(normalized)
	            return self._extract_canonical(representatives, clusters)
	        except Exception as e:
	            # Deliberate best-effort: fall back to the string-level result.
	            print(f"Warning: Embedding failed ({e}), using string dedup")
	            return representatives

	    def _cluster_by_similarity(self, skills: List[str]) -> List[List[int]]:
	        """
	        Group skill indices by embedding similarity.

	        This is a greedy, seed-based grouping, not a transitive one: each
	        not-yet-assigned skill opens a cluster and absorbs every later
	        unassigned skill whose similarity to that seed exceeds the threshold.

	        Returns:
	            Clusters of indices into `skills`, e.g. [[0, 1], [2, 3]].
	        """
	        # One embedding per skill (presumably an (N, dim) array - confirm
	        # against the embedder in use).
	        embeddings = self.embedder.encode(skills)

	        # Pairwise similarity matrix, indexed [i][j].
	        similarity_matrix = cosine_similarity(embeddings)

	        clusters = []
	        assigned = set()

	        for seed in range(len(skills)):
	            if seed in assigned:
	                continue

	            cluster = [seed]
	            assigned.add(seed)

	            for other in range(seed + 1, len(skills)):
	                if other in assigned:
	                    continue
	                if similarity_matrix[seed][other] > self.similarity_threshold:
	                    cluster.append(other)
	                    assigned.add(other)

	            clusters.append(cluster)

	        return clusters

	    def _extract_canonical(self, original_skills: List[str],
	                           clusters: List[List[int]]) -> List[str]:
	        """
	        Pick one canonical name per cluster.

	        Args:
	            original_skills: Names that the cluster indices refer to.
	            clusters: Lists of indices into `original_skills`.

	        Returns:
	            For every cluster, its longest name (heuristic: longest is the
	            most descriptive); on equal length the first one wins.
	        """
	        return [
	            max((original_skills[i] for i in cluster), key=len)
	            for cluster in clusters
	        ]


	# ============================================================================
	# 3. ENHANCED EXPLAINABILITY (backend/ai_module/matching/explainability.py)
	# ============================================================================

	from datetime import datetime
	from enum import Enum

	class SkillMatchStatus(str, Enum):
	    """
	    Status attached to a skill in a match explanation.

	    Members are also `str` instances, so they compare equal to their plain
	    string values.
	    """

	    # The skill required by the criteria is present on the candidate.
	    MATCHED = "matched"

	    # The skill required by the criteria is absent from the candidate.
	    MISSING = "missing"

	    # Declared for extra skills; not assigned anywhere in the visible code.
	    BONUS = "bonus"

	class ExplainabilityEngine:
	    """
	    Explain every component of a matching score in detail.

	    explain_score() returns a dict shaped like:
	        {
	            "timestamp": "...",
	            "candidate": {"id": ..., "name": ..., "email": ...},
	            "criteria": {"id": ..., "title": ...},
	            "score": {"total": 0.847, "percentage": "84.7%", "components": {...}},
	            "breakdown": {"skills": {...}, "semantic": {...},
	                          "experience": {...}, "education": {...}},
	            "insights": {"strengths": [...], "gaps": [...]},
	            "decision": {"recommendation": "...", "rationale": "..."},
	            "confidence": 0.88
	        }
	    """

	    # Weight of each component in the overall score.
	    WEIGHTS = {
	        "skills": 0.50,
	        "semantic": 0.20,
	        "experience": 0.15,
	        "education": 0.10,
	        "bonus": 0.05,
	    }

	    # Recommendation cut-offs used when the caller supplies none.
	    DEFAULT_ACCEPT_THRESHOLD = 0.80
	    DEFAULT_REVIEW_THRESHOLD = 0.50

	    def explain_score(self, candidate, criteria, total_score: float,
	                      thresholds: Dict = None) -> Dict:
	        """
	        Build the full explanation of a match score.

	        Args:
	            candidate: Object exposing `id`, `full_name`, `email`,
	                `candidate_skills`, `years_experience` and
	                `extracted_education`.
	            criteria: Object exposing `id`, `title` and `criteria_skills`.
	            total_score: Match score (expected in 0.0-1.0).
	            thresholds: Optional dict with "accept" and/or "review" cut-offs
	                that override the defaults (0.80 / 0.50) for the
	                recommendation, e.g. per-domain thresholds.

	        Returns:
	            The structured explanation described in the class docstring.
	        """
	        skills_breakdown = self._explain_skills(candidate, criteria)
	        semantic_breakdown = self._explain_semantic(candidate, criteria)
	        experience_breakdown = self._explain_experience(candidate, criteria)
	        education_breakdown = self._explain_education(candidate, criteria)

	        strengths = self._identify_strengths(skills_breakdown)
	        gaps = self._identify_gaps(skills_breakdown)

	        overrides = thresholds or {}
	        recommendation = self._recommend_action(
	            total_score,
	            accept_threshold=overrides.get("accept", self.DEFAULT_ACCEPT_THRESHOLD),
	            review_threshold=overrides.get("review", self.DEFAULT_REVIEW_THRESHOLD),
	        )

	        confidence = self._calculate_confidence(
	            candidate, criteria, skills_breakdown
	        )

	        return {
	            "timestamp": self._utc_now_iso(),
	            "candidate": {
	                "id": candidate.id,
	                "name": candidate.full_name,
	                "email": candidate.email,
	            },
	            "criteria": {
	                "id": criteria.id,
	                "title": criteria.title,
	            },
	            "score": {
	                "total": round(total_score, 3),
	                "percentage": f"{total_score*100:.1f}%",
	                "components": {
	                    "skills": round(skills_breakdown["score"], 3),
	                    "semantic": round(semantic_breakdown["score"], 3),
	                    "experience": round(experience_breakdown["score"], 3),
	                    "education": round(education_breakdown["score"], 3),
	                },
	            },
	            "breakdown": {
	                "skills": skills_breakdown,
	                "semantic": semantic_breakdown,
	                "experience": experience_breakdown,
	                "education": education_breakdown,
	            },
	            "insights": {
	                "strengths": strengths,
	                "gaps": gaps,
	            },
	            "decision": {
	                "recommendation": recommendation["action"],
	                "rationale": recommendation["rationale"],
	            },
	            "confidence": confidence,
	        }

	    @staticmethod
	    def _utc_now_iso() -> str:
	        """Return the current UTC time as a naive ISO-8601 string."""
	        # datetime.utcnow() is deprecated; take an aware "now" and drop the
	        # tzinfo so the string keeps the historical format (no "+00:00").
	        from datetime import timezone
	        return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

	    def _explain_skills(self, candidate, criteria) -> Dict:
	        """
	        Detail which criteria skills the candidate has.

	        Matching is a case-insensitive comparison of skill names. "score" is
	        the unweighted share of criteria skills that matched; each entry's
	        "contribution" is its weight share scaled by WEIGHTS["skills"].
	        """
	        candidate_skill_names = {
	            s.skill.name.lower()
	            for s in candidate.candidate_skills
	            if s.skill
	        }

	        # Rows without a linked skill are skipped below, so they must not
	        # inflate the weight total used to normalise contributions either.
	        scored_criteria = [c for c in criteria.criteria_skills if c.skill]
	        total_weight = sum(c.weight for c in scored_criteria) or 100
	        skills_share = self.WEIGHTS["skills"]

	        matched = []
	        missing = []

	        for criterion in scored_criteria:
	            skill_name = criterion.skill.name
	            is_present = skill_name.lower() in candidate_skill_names

	            contribution = (criterion.weight / total_weight) if is_present else 0

	            skill_info = {
	                "skill": skill_name,
	                "weight": criterion.weight,
	                "status": SkillMatchStatus.MATCHED if is_present else SkillMatchStatus.MISSING,
	                "contribution": round(contribution * skills_share, 3),
	            }

	            if is_present:
	                matched.append(skill_info)
	            else:
	                missing.append(skill_info)

	        considered = len(matched) + len(missing)
	        score = len(matched) / max(1, considered)

	        return {
	            "score": score,
	            "matched": matched,
	            "missing": missing,
	            "coverage": f"{len(matched)}/{considered} core skills",
	            "summary": f"Matched {len(matched)}/{considered} required skills"
	        }

	    def _explain_semantic(self, candidate, criteria) -> Dict:
	        """
	        Semantic similarity between CV and job description.

	        NOTE: placeholder - returns fixed example values and ignores both
	        arguments.
	        """
	        return {
	            "score": 0.75,
	            "reason": "Strong alignment with job description keywords",
	            "keywords_matched": ["python", "leadership", "frontend"],
	            "keywords_missing": ["kubernetes"],
	        }

	    def _explain_experience(self, candidate, criteria) -> Dict:
	        """
	        Assess experience from `candidate.years_experience`.

	        The score grows linearly to 1.0 at 10 years and is clamped to
	        [0.0, 1.0]; a missing value counts as 0 years. `criteria` is unused.
	        """
	        years = candidate.years_experience or 0
	        return {
	            # Clamp the lower bound too so bad (negative) data cannot yield
	            # a negative score.
	            "score": min(max(years, 0) / 10.0, 1.0),
	            "years": years,
	            "assessment": "Senior level" if years >= 5 else "Junior-Mid level",
	        }

	    def _explain_education(self, candidate, criteria) -> Dict:
	        """
	        Education assessment.

	        NOTE: placeholder - score and assessment are fixed values; only the
	        degree is read from `candidate.extracted_education`.
	        """
	        return {
	            "score": 0.8,
	            "degree": candidate.extracted_education or "Not specified",
	            "assessment": "Relevant background",
	        }

	    @staticmethod
	    def _top_weighted(entries: List[Dict], limit: int = 3) -> List[str]:
	        """Format the `limit` heaviest skill entries as "name (weight%)"."""
	        ranked = sorted(entries, key=lambda x: x["weight"], reverse=True)[:limit]
	        return [f"{s['skill']} ({s['weight']}%)" for s in ranked]

	    def _identify_strengths(self, skills_breakdown: Dict) -> List[str]:
	        """Return up to three matched skills with the highest weight."""
	        return self._top_weighted(skills_breakdown.get("matched", []))

	    def _identify_gaps(self, skills_breakdown: Dict) -> List[str]:
	        """Return up to three missing skills with the highest weight."""
	        return self._top_weighted(skills_breakdown.get("missing", []))

	    def _recommend_action(self, score: float,
	                          accept_threshold: float = DEFAULT_ACCEPT_THRESHOLD,
	                          review_threshold: float = DEFAULT_REVIEW_THRESHOLD) -> Dict:
	        """
	        Turn a score into a recommendation.

	        Args:
	            score: Match score.
	            accept_threshold: Minimum score for ACCEPT (default 0.80).
	            review_threshold: Minimum score for REVIEW (default 0.50).

	        Returns:
	            Dict with "action", "rationale" and a "confidence" label.
	        """
	        if score >= accept_threshold:
	            return {
	                "action": "ACCEPT - Interview now",
	                "rationale": "Strong match on core criteria",
	                "confidence": "High"
	            }
	        if score >= review_threshold:
	            return {
	                "action": "REVIEW - Phone screen first",
	                "rationale": "Good match but verify specific skills",
	                "confidence": "Medium"
	            }
	        return {
	            "action": "PASS - Not aligned",
	            "rationale": "Missing too many core skills",
	            "confidence": "High"
	        }

	    def _calculate_confidence(self, candidate, criteria, skills_breakdown: Dict) -> float:
	        """
	        Confidence in the scoring, rounded to 2 decimals (0.0-1.0).

	        Starts at 0.8, is scaled by 0.7 when the criteria list fewer than 3
	        skills, and by 1.1 (capped at 1.0) when every criteria skill matched.
	        """
	        confidence = 0.8

	        # Few criteria skills give little evidence to score on.
	        if len(criteria.criteria_skills) < 3:
	            confidence *= 0.7

	        # Reward full coverage - but only when something actually matched;
	        # "nothing missing" is vacuously true when there were no skills.
	        if skills_breakdown.get("matched") and not skills_breakdown.get("missing"):
	            confidence = min(confidence * 1.1, 1.0)

	        return round(confidence, 2)


	# ============================================================================
	# USAGE EXAMPLES
	# ============================================================================

	def main():
	    """Run the three Phase 1 demos and print their results to stdout."""
	    heavy_rule = "=" * 70
	    light_rule = "-" * 70

	    print(heavy_rule)
	    print("Phase 1 Implementation Examples")
	    print(heavy_rule)

	    # Demo 1: domain detection and the thresholds it selects.
	    print("\n1️⃣ ADAPTIVE THRESHOLDS")
	    print(light_rule)

	    threshold_engine = AdaptiveThresholdEngine()
	    sample_titles = (
	        "Senior Data Scientist",
	        "Financial Analyst",
	        "Startup Full Stack Developer",
	    )

	    for title in sample_titles:
	        report = threshold_engine.get_thresholds_with_explanation(title)
	        cutoffs = report['thresholds']
	        print(f"\nJob: {report['job_title']}")
	        print(f"Domain: {report['domain']}")
	        print(f"Thresholds: Accept={cutoffs['accept']:.0%}, Review={cutoffs['review']:.0%}")
	        print(f"Rationale: {report['rationale']}")

	    # Demo 2: deduplication. No embedder is supplied here, so only the
	    # string-level pass of the deduplicator runs.
	    print("\n\n2️⃣ SMART DEDUPLICATION")
	    print(light_rule)

	    dedup = SmartSkillDeduplicator(similarity_threshold=0.82)
	    sample_skill_sets = (
	        ["Python", "python", "python3"],
	        ["JavaScript", "JS", "Node.js", "TypeScript"],
	        ["Data Analysis", "Analytics", "Data Analytics"],
	    )

	    for skill_set in sample_skill_sets:
	        deduplicated = dedup.deduplicate(skill_set)
	        print(f"\nInput: {skill_set}")
	        print(f"Output: {deduplicated}")

	    # Demo 3: the explainability engine is not executed; a static sample of
	    # its output is printed instead.
	    print("\n\n3️⃣ ENHANCED EXPLAINABILITY")
	    print(light_rule)
	    print("\nExample output structure:")
	    print("""
	{
	"timestamp": "2026-05-12T23:50:00.000000",
	"candidate": {
	"id": 1,
	"name": "Ahmed Ben",
	"email": "ahmed@example.com"
	},
	"score": {
	"total": 0.847,
	"percentage": "84.7%",
	"components": {
	"skills": 0.85,
	"semantic": 0.72,
	"experience": 0.9,
	"education": 0.8
	}
	},
	"insights": {
	"strengths": ["Python (25%)", "Leadership (20%)", "Cloud (15%)"],
	"gaps": ["Kubernetes (15%)", "DevOps (10%)"]
	},
	"decision": {
	"recommendation": "ACCEPT - Interview now",
	"rationale": "Strong match on core criteria",
	"confidence": 0.92
	}
	}
	""")

	    print("\n" + heavy_rule)
	    print("✅ Phase 1 Examples Complete")
	    print(heavy_rule)


	if __name__ == "__main__":
	    main()