ai-talent-finder-backend / scripts /phase1_implementation_starter.py
ilyass yani
Deploiement backend dans HF Spaces
9df97a2
Raw
History Blame
21.8 kB
#!/usr/bin/env python3
"""
PHASE 1 IMPLEMENTATION STARTER — Quick Wins
Trois implémentations concrètes pour démarrer immédiatement:
1. Adaptive Thresholds (1-2 hours)
2. Smart Deduplication (1-2 hours)
3. Enhanced Explainability (2-3 hours)
Prêt à copy-paste dans votre codebase.
"""
# ============================================================================
# 1. ADAPTIVE THRESHOLDS (backend/ai_module/matching/adaptive_thresholds.py)
# ============================================================================
import logging
import re
from datetime import datetime, timezone
from typing import Dict, Optional, Tuple
class AdaptiveThresholdEngine:
    """
    Select matching thresholds based on the domain detected in a job title.

    The domain is chosen by counting how many of each domain's keywords occur
    in the title as whole words/phrases; the domain with the most hits wins,
    and ties go to the domain declared first in ``DOMAIN_KEYWORDS``.

    Usage:
        engine = AdaptiveThresholdEngine()
        thresholds = engine.get_thresholds("Senior Data Scientist")
        # → {"accept": 0.75, "review": 0.45, "confidence": 0.90}
    """

    # Per-domain thresholds. "default" is used when no keyword matches.
    DOMAIN_CONFIG = {
        # Data Science: very high bar (rare talent)
        "data_science": {
            "accept": 0.75,
            "review": 0.45,
            "confidence": 0.90,
            "description": "High bar - specialized domain"
        },
        # Finance: security comes first
        "finance": {
            "accept": 0.80,
            "review": 0.50,
            "confidence": 0.85,
            "description": "Very strict - compliance critical"
        },
        # Backend/DevOps: mixed skills okay
        "backend": {
            "accept": 0.70,
            "review": 0.40,
            "confidence": 0.85,
            "description": "Moderate - diverse tech ok"
        },
        # Frontend: creative + technical
        "frontend": {
            "accept": 0.70,
            "review": 0.38,
            "confidence": 0.80,
            "description": "Moderate - UX skills valuable"
        },
        # Startup: high flexibility
        "startup": {
            "accept": 0.60,
            "review": 0.30,
            "confidence": 0.75,
            "description": "Low bar - versatility valued"
        },
        # Product/PM: soft skills important
        "product": {
            "accept": 0.65,
            "review": 0.35,
            "confidence": 0.80,
            "description": "Moderate - soft skills matter"
        },
        # Sales/Marketing: personality + skills
        "business": {
            "accept": 0.60,
            "review": 0.32,
            "confidence": 0.75,
            "description": "Low bar - personality critical"
        },
        # ML/AI: highly specialized
        "machine_learning": {
            "accept": 0.78,
            "review": 0.48,
            "confidence": 0.90,
            "description": "High bar - specialized"
        },
        # DevOps/Infrastructure
        "devops": {
            "accept": 0.72,
            "review": 0.42,
            "confidence": 0.85,
            "description": "High bar - reliability critical"
        },
        # Default fallback
        "default": {
            "accept": 0.80,
            "review": 0.50,
            "confidence": 0.80,
            "description": "Standard thresholds"
        }
    }

    # Keywords used for domain detection (all lowercase; titles are
    # lowercased before matching).
    DOMAIN_KEYWORDS = {
        "data_science": [
            "data scientist", "data science", "analytics", "statistical",
            "machine learning", "ml engineer", "data engineer", "big data"
        ],
        "finance": [
            "financial", "accountant", "trader", "analyst", "finance",
            "risk", "banking", "investment", "portfolio"
        ],
        "backend": [
            "backend", "server", "api", "python", "java", "golang",
            "infrastructure", "architect", "systems engineer", "performance"
        ],
        "frontend": [
            "frontend", "ui", "ux", "react", "vue", "angular",
            "web developer", "designer", "visual", "css", "javascript"
        ],
        "startup": [
            # "startup" (singular) added: with whole-word matching the plural
            # alone would never match a title such as "Startup Engineer".
            "startup", "startups", "founder", "early stage", "mvp",
            "bootstrapped", "rapid", "agile", "full stack",
            "jack of all trades"
        ],
        "product": [
            "product manager", "pm", "product owner", "po",
            "roadmap", "strategy", "vision", "user experience"
        ],
        "business": [
            "sales", "business development", "marketing", "bd",
            "account manager", "customer", "commercial", "partnership"
        ],
        "machine_learning": [
            "machine learning", "ml", "deep learning", "neural",
            "tensorflow", "pytorch", "ai engineer", "ai scientist"
        ],
        "devops": [
            "devops", "sre", "kubernetes", "docker", "infrastructure",
            "ci/cd", "deployment", "cloud", "aws", "gcp", "azure"
        ],
    }

    @staticmethod
    def _contains_keyword(text: str, keyword: str) -> bool:
        """Return True if *keyword* occurs in *text* as a whole word/phrase.

        Lookarounds are used instead of a plain substring test so that short
        keywords such as "ml", "po" or "api" do not fire inside unrelated
        words ("html", "support", "rapid").
        """
        pattern = rf"(?<!\w){re.escape(keyword)}(?!\w)"
        return re.search(pattern, text) is not None

    def detect_domain(self, job_title: str) -> str:
        """
        Detect the job domain from its title.

        Args:
            job_title: e.g. "Senior Data Scientist"

        Returns:
            Domain slug, e.g. "data_science"; "default" when the title is
            empty or no keyword matches.
        """
        if not job_title:
            return "default"

        title = job_title.lower()

        # Score each domain by the number of its keywords found in the title.
        domain_scores = {
            domain: sum(
                1 for keyword in keywords
                if self._contains_keyword(title, keyword)
            )
            for domain, keywords in self.DOMAIN_KEYWORDS.items()
        }
        if not domain_scores:
            return "default"

        # max() keeps the first domain on ties, i.e. declaration order.
        best_domain = max(domain_scores, key=domain_scores.get)

        # No keyword matched at all: fall back to the default thresholds.
        if domain_scores[best_domain] == 0:
            return "default"
        return best_domain

    def _resolve_config(self, job_title: str) -> Tuple[str, Dict]:
        """Return ``(domain, config)`` for a title, defaulting unknown domains."""
        domain = self.detect_domain(job_title)
        config = self.DOMAIN_CONFIG.get(domain, self.DOMAIN_CONFIG["default"])
        return domain, config

    def get_thresholds(self, job_title: str) -> Dict[str, float]:
        """
        Return the adaptive thresholds for a job.

        Args:
            job_title: Job title.

        Returns:
            e.g. {"accept": 0.70, "review": 0.40, "confidence": 0.85}
            (the human-readable description is left out).
        """
        _, config = self._resolve_config(job_title)
        return {
            "accept": config["accept"],
            "review": config["review"],
            "confidence": config["confidence"],
        }

    def get_thresholds_with_explanation(self, job_title: str) -> Dict:
        """Return the thresholds plus the detected domain and its rationale.

        ``detected_at`` is a naive ISO-8601 UTC timestamp (no offset suffix).
        """
        domain, config = self._resolve_config(job_title)
        # datetime.utcnow() is deprecated since Python 3.12; take an aware UTC
        # time and drop tzinfo so the emitted string format stays unchanged.
        detected_at = datetime.now(timezone.utc).replace(tzinfo=None)
        return {
            "domain": domain,
            "job_title": job_title,
            "thresholds": {
                "accept": config["accept"],
                "review": config["review"],
                "confidence": config["confidence"],
            },
            "rationale": config["description"],
            "detected_at": detected_at.isoformat(),
        }
# ============================================================================
# 2. SMART DEDUPLICATION (backend/ai_module/nlp/smart_dedup.py)
# ============================================================================
from typing import List, Set
from numpy import ndarray
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
class SmartSkillDeduplicator:
    """
    Deduplicate skills using semantic similarity on top of string matching.

    Exact duplicates (ignoring case and surrounding whitespace) are always
    merged, keeping the first spelling seen. When an embedder is supplied,
    near-duplicates are additionally merged and the longest spelling of each
    group is kept.

    Example:
        dedup = SmartSkillDeduplicator(embedder)
        result = dedup.deduplicate(["Python", "python", "ML"])
        # without an embedder → ["Python", "ML"]
    """

    def __init__(self, embedder=None, similarity_threshold: float = 0.82):
        """
        Args:
            embedder: Object exposing ``encode(list_of_str)`` that returns one
                vector per string (presumably a SentenceTransformer — confirm
                against the caller). When falsy, only exact deduplication
                is performed.
            similarity_threshold: Cosine similarity that must be exceeded for
                two skills to be merged (0.0-1.0).
        """
        self.embedder = embedder
        self.similarity_threshold = similarity_threshold

    def deduplicate(self, skills: List[str]) -> List[str]:
        """
        Deduplicate a list of skills.

        Args:
            skills: e.g. ["Python", "python", "ML", "Machine Learning"]

        Returns:
            Canonical names in their original (stripped) spelling, in order
            of first appearance. If embedding or clustering fails, the
            exact-match result is returned instead.
        """
        if not skills:
            return []

        # First pass: exact dedup on the normalized key, remembering the first
        # original spelling of each key. dict preserves insertion order.
        first_spelling = {}
        for raw in skills:
            cleaned = raw.strip()
            first_spelling.setdefault(cleaned.lower(), cleaned)

        normalized_keys = list(first_spelling)
        representatives = list(first_spelling.values())

        if len(normalized_keys) <= 1 or not self.embedder:
            return representatives

        # Second pass: semantic clustering. Cluster indices refer to
        # ``normalized_keys``, so they must be resolved against the
        # index-aligned ``representatives`` (not the raw input list, whose
        # positions shift as soon as an exact duplicate was removed).
        try:
            clusters = self._cluster_by_similarity(normalized_keys)
            return self._extract_canonical(representatives, clusters)
        except Exception as exc:
            # Deliberate best-effort: any embedder failure degrades to the
            # exact-match result rather than failing the caller.
            logging.getLogger(__name__).warning(
                "Embedding failed (%s), using string dedup", exc
            )
            return representatives

    def _cluster_by_similarity(self, skills: List[str]) -> List[List[int]]:
        """
        Group skill indices by semantic similarity.

        This is a greedy, seed-based grouping: each not-yet-assigned skill
        starts a cluster and absorbs every later unassigned skill whose
        similarity *to that seed* exceeds the threshold. It is not transitive.

        Returns:
            e.g. [[0, 1], [2, 3]] — indices into *skills*.
        """
        embeddings = self.embedder.encode(skills)
        similarity_matrix = cosine_similarity(embeddings)  # (N, N)

        clusters = []
        assigned = set()
        total = len(skills)
        for seed in range(total):
            if seed in assigned:
                continue
            cluster = [seed]
            assigned.add(seed)
            for other in range(seed + 1, total):
                if other in assigned:
                    continue
                if similarity_matrix[seed][other] > self.similarity_threshold:
                    cluster.append(other)
                    assigned.add(other)
            clusters.append(cluster)
        return clusters

    def _extract_canonical(self, original_skills: List[str],
                           clusters: List[List[int]]) -> List[str]:
        """
        Pick one canonical skill per cluster.

        Args:
            original_skills: Spellings index-aligned with the list that was
                clustered.
            clusters: Index groups as returned by ``_cluster_by_similarity``.

        Returns:
            The longest spelling of each cluster (first one on ties), on the
            assumption that the longest name is the most descriptive.
        """
        return [
            max((original_skills[idx] for idx in cluster), key=len)
            for cluster in clusters
        ]
# ============================================================================
# 3. ENHANCED EXPLAINABILITY (backend/ai_module/matching/explainability.py)
# ============================================================================
from datetime import datetime
from enum import Enum
class SkillMatchStatus(str, Enum):
    """Outcome of comparing one skill against a candidate's skill set.

    Members are also plain ``str`` instances, so they compare equal to their
    string values.
    """

    # Skill is present on the candidate.
    MATCHED = "matched"
    # Skill is absent from the candidate.
    MISSING = "missing"
    # Declared for extra skills; not referenced elsewhere in this file.
    BONUS = "bonus"
class ExplainabilityEngine:
    """
    Explain each component of a candidate/job matching score.

    ``explain_score`` returns a dict with the keys ``timestamp``,
    ``candidate``, ``criteria``, ``score`` (total, percentage, components),
    ``breakdown``, ``insights`` (strengths, gaps), ``decision``
    (recommendation, rationale) and ``confidence``.
    """

    # Scoring weights per component.
    WEIGHTS = {
        "skills": 0.50,
        "semantic": 0.20,
        "experience": 0.15,
        "education": 0.10,
        "bonus": 0.05,
    }

    # Decision cut-offs used when the caller supplies no thresholds.
    DEFAULT_ACCEPT_THRESHOLD = 0.80
    DEFAULT_REVIEW_THRESHOLD = 0.50

    def explain_score(self, candidate, criteria, total_score: float,
                      thresholds: Optional[Dict[str, float]] = None) -> Dict:
        """
        Build the full explanation of a score.

        Args:
            candidate: Candidate model instance (reads ``id``, ``full_name``,
                ``email``, ``candidate_skills``, ``years_experience``,
                ``extracted_education``).
            criteria: JobCriteria model instance (reads ``id``, ``title``,
                ``criteria_skills``).
            total_score: Match score (0.0-1.0).
            thresholds: Optional mapping with "accept" and/or "review"
                cut-offs, e.g. the output of
                ``AdaptiveThresholdEngine.get_thresholds``. Missing keys fall
                back to 0.80 / 0.50.

        Returns:
            Structured explanation (see class docstring).
        """
        # Per-component evaluations.
        skills_breakdown = self._explain_skills(candidate, criteria)
        semantic_breakdown = self._explain_semantic(candidate, criteria)
        experience_breakdown = self._explain_experience(candidate, criteria)
        education_breakdown = self._explain_education(candidate, criteria)

        strengths = self._identify_strengths(skills_breakdown)
        gaps = self._identify_gaps(skills_breakdown)
        recommendation = self._recommend_action(total_score, thresholds)
        confidence = self._calculate_confidence(
            candidate, criteria, skills_breakdown
        )

        # datetime.utcnow() is deprecated since Python 3.12; take an aware UTC
        # time and drop tzinfo so the emitted string format stays unchanged.
        generated_at = datetime.now(timezone.utc).replace(tzinfo=None)

        return {
            "timestamp": generated_at.isoformat(),
            "candidate": {
                "id": candidate.id,
                "name": candidate.full_name,
                "email": candidate.email,
            },
            "criteria": {
                "id": criteria.id,
                "title": criteria.title,
            },
            "score": {
                "total": round(total_score, 3),
                "percentage": f"{total_score*100:.1f}%",
                "components": {
                    "skills": round(skills_breakdown["score"], 3),
                    "semantic": round(semantic_breakdown["score"], 3),
                    "experience": round(experience_breakdown["score"], 3),
                    "education": round(education_breakdown["score"], 3),
                },
            },
            "breakdown": {
                "skills": skills_breakdown,
                "semantic": semantic_breakdown,
                "experience": experience_breakdown,
                "education": education_breakdown,
            },
            "insights": {
                "strengths": strengths,
                "gaps": gaps,
            },
            "decision": {
                "recommendation": recommendation["action"],
                "rationale": recommendation["rationale"],
            },
            "confidence": confidence,
        }

    def _explain_skills(self, candidate, criteria) -> Dict:
        """Detail which required skills the candidate has or lacks.

        Skills are compared by case-insensitive name. Criteria entries without
        an attached skill are ignored entirely, including for the weight total.
        """
        # Lowercased names of the candidate's skills, for O(1) membership.
        candidate_skill_names = {
            entry.skill.name.lower()
            for entry in candidate.candidate_skills
            if entry.skill
        }

        # Only criteria that actually reference a skill take part; summing the
        # weight over skipped entries would deflate every contribution.
        valid_criteria = [c for c in criteria.criteria_skills if c.skill]
        total_weight = sum(c.weight for c in valid_criteria) or 100
        skills_weight = self.WEIGHTS["skills"]

        matched = []
        missing = []
        for criterion in valid_criteria:
            skill_name = criterion.skill.name
            is_present = skill_name.lower() in candidate_skill_names
            share = (criterion.weight / total_weight) if is_present else 0
            skill_info = {
                "skill": skill_name,
                "weight": criterion.weight,
                "status": (
                    SkillMatchStatus.MATCHED if is_present
                    else SkillMatchStatus.MISSING
                ),
                # Share of the overall score attributable to this skill.
                "contribution": round(share * skills_weight, 3),
            }
            (matched if is_present else missing).append(skill_info)

        considered = len(matched) + len(missing)
        # NOTE(review): the score is the unweighted fraction of matched
        # skills, while "contribution" is weight-based — confirm this is
        # intended.
        score = len(matched) / max(1, considered)
        return {
            "score": score,
            "matched": matched,
            "missing": missing,
            "coverage": f"{len(matched)}/{considered} core skills",
            "summary": f"Matched {len(matched)}/{considered} required skills"
        }

    def _explain_semantic(self, candidate, criteria) -> Dict:
        """Semantic similarity of CV vs job description.

        Placeholder: returns fixed example values and ignores both arguments.
        """
        return {
            "score": 0.75,
            "reason": "Strong alignment with job description keywords",
            "keywords_matched": ["python", "leadership", "frontend"],
            "keywords_missing": ["kubernetes"],
        }

    def _explain_experience(self, candidate, criteria) -> Dict:
        """Score experience as years/10 capped at 1.0; a missing value counts as 0."""
        years = candidate.years_experience or 0
        return {
            "score": min(years / 10.0, 1.0),
            "years": years,
            "assessment": "Senior level" if years >= 5 else "Junior-Mid level",
        }

    def _explain_education(self, candidate, criteria) -> Dict:
        """Report the extracted education with a fixed placeholder score of 0.8."""
        return {
            "score": 0.8,
            "degree": candidate.extracted_education or "Not specified",
            "assessment": "Relevant background",
        }

    @staticmethod
    def _top_by_weight(entries: List[Dict], limit: int = 3) -> List[str]:
        """Format the *limit* heaviest skill entries as "name (weight%)"."""
        ranked = sorted(entries, key=lambda item: item["weight"], reverse=True)
        return [f"{item['skill']} ({item['weight']}%)" for item in ranked[:limit]]

    def _identify_strengths(self, skills_breakdown: Dict) -> List[str]:
        """Return the top 3 matched skills by weight."""
        return self._top_by_weight(skills_breakdown.get("matched", []))

    def _identify_gaps(self, skills_breakdown: Dict) -> List[str]:
        """Return the top 3 missing skills by weight."""
        return self._top_by_weight(skills_breakdown.get("missing", []))

    def _recommend_action(self, score: float,
                          thresholds: Optional[Dict[str, float]] = None) -> Dict:
        """Map a score to an ACCEPT / REVIEW / PASS recommendation.

        Args:
            score: Match score (0.0-1.0).
            thresholds: Optional "accept"/"review" cut-offs; defaults are
                0.80 and 0.50.

        Returns:
            Dict with "action", "rationale" and a textual "confidence".
        """
        limits = thresholds or {}
        accept_at = limits.get("accept", self.DEFAULT_ACCEPT_THRESHOLD)
        review_at = limits.get("review", self.DEFAULT_REVIEW_THRESHOLD)

        if score >= accept_at:
            return {
                "action": "ACCEPT - Interview now",
                "rationale": "Strong match on core criteria",
                "confidence": "High"
            }
        if score >= review_at:
            return {
                "action": "REVIEW - Phone screen first",
                "rationale": "Good match but verify specific skills",
                "confidence": "Medium"
            }
        return {
            "action": "PASS - Not aligned",
            "rationale": "Missing too many core skills",
            "confidence": "High"
        }

    def _calculate_confidence(self, candidate, criteria, skills_breakdown: Dict) -> float:
        """Confidence in the score (0.0-1.0), rounded to 2 decimals.

        Starts at 0.8, is multiplied by 0.7 when fewer than 3 criteria skills
        exist, and by 1.1 (capped at 1.0) when every required skill matched.
        """
        confidence = 0.8  # Base

        # Few criteria skills: the score rests on little evidence.
        if len(criteria.criteria_skills) < 3:
            confidence *= 0.7

        # Boost only for a genuine full match. "Nothing missing" is also true
        # when there were no skills to compare, which must not raise
        # confidence.
        has_matches = bool(skills_breakdown.get("matched"))
        if has_matches and not skills_breakdown["missing"]:
            confidence = min(confidence * 1.1, 1.0)

        return round(confidence, 2)
# ============================================================================
# USAGE EXAMPLES
# ============================================================================
if __name__ == "__main__":
print("=" * 70)
print("Phase 1 Implementation Examples")
print("=" * 70)
# 1. Adaptive Thresholds
print("\n1️⃣ ADAPTIVE THRESHOLDS")
print("-" * 70)
threshold_engine = AdaptiveThresholdEngine()
test_jobs = [
"Senior Data Scientist",
"Financial Analyst",
"Startup Full Stack Developer",
]
for job in test_jobs:
result = threshold_engine.get_thresholds_with_explanation(job)
print(f"\nJob: {result['job_title']}")
print(f"Domain: {result['domain']}")
print(f"Thresholds: Accept={result['thresholds']['accept']:.0%}, Review={result['thresholds']['review']:.0%}")
print(f"Rationale: {result['rationale']}")
# 2. Smart Deduplication
print("\n\n2️⃣ SMART DEDUPLICATION")
print("-" * 70)
dedup = SmartSkillDeduplicator(similarity_threshold=0.82)
test_skills = [
["Python", "python", "python3"],
["JavaScript", "JS", "Node.js", "TypeScript"],
["Data Analysis", "Analytics", "Data Analytics"],
]
for skills in test_skills:
result = dedup.deduplicate(skills)
print(f"\nInput: {skills}")
print(f"Output: {result}")
# 3. Explainability (example structure)
print("\n\n3️⃣ ENHANCED EXPLAINABILITY")
print("-" * 70)
print("\nExample output structure:")
print("""
{
"timestamp": "2026-05-12T23:50:00.000000",
"candidate": {
"id": 1,
"name": "Ahmed Ben",
"email": "ahmed@example.com"
},
"score": {
"total": 0.847,
"percentage": "84.7%",
"components": {
"skills": 0.85,
"semantic": 0.72,
"experience": 0.9,
"education": 0.8
}
},
"insights": {
"strengths": ["Python (25%)", "Leadership (20%)", "Cloud (15%)"],
"gaps": ["Kubernetes (15%)", "DevOps (10%)"]
},
"decision": {
"recommendation": "ACCEPT - Interview now",
"rationale": "Strong match on core criteria",
"confidence": 0.92
}
}
""")
print("\n" + "=" * 70)
print("✅ Phase 1 Examples Complete")
print("=" * 70)