ilyass yani commited on
Commit ·
9614c9c
1
Parent(s): 9e6bc9f
Batch-2: fix CosineScorer, delete/put candidats, BOM extraction, ranking, _can_access_profile
Browse files- ai_module/matching/__init__.py +2 -0
- ai_module/nlp/resume_ner_extractor.py +57 -0
- app/api/candidates.py +10 -3
- app/api/matching.py +63 -0
- app/services/cv_extractor.py +9 -2
ai_module/matching/__init__.py
CHANGED
|
@@ -9,11 +9,13 @@
|
|
| 9 |
# from ai_module.matching import BertClassifierAdapter
|
| 10 |
from ai_module.matching.hybrid_matcher import HybridConfig, HybridMatcher
|
| 11 |
from ai_module.matching.bert_classifier_adapter import BertClassifierAdapter, get_default_adapter
|
|
|
|
| 12 |
|
| 13 |
__all__ = [
|
| 14 |
"HybridConfig",
|
| 15 |
"HybridMatcher",
|
| 16 |
"BertClassifierAdapter",
|
| 17 |
"get_default_adapter",
|
|
|
|
| 18 |
]
|
| 19 |
|
|
|
|
| 9 |
# from ai_module.matching import BertClassifierAdapter
|
| 10 |
from ai_module.matching.hybrid_matcher import HybridConfig, HybridMatcher
|
| 11 |
from ai_module.matching.bert_classifier_adapter import BertClassifierAdapter, get_default_adapter
|
| 12 |
+
from ai_module.matching.scorer import CosineScorer
|
| 13 |
|
| 14 |
__all__ = [
|
| 15 |
"HybridConfig",
|
| 16 |
"HybridMatcher",
|
| 17 |
"BertClassifierAdapter",
|
| 18 |
"get_default_adapter",
|
| 19 |
+
"CosineScorer",
|
| 20 |
]
|
| 21 |
|
ai_module/nlp/resume_ner_extractor.py
CHANGED
|
@@ -61,6 +61,63 @@ class ResumeNERExtractor:
|
|
| 61 |
# Other Tools
|
| 62 |
"jira", "confluence", "slack", "discord", "figma", "sketch",
|
| 63 |
"vim", "vscode", "intellij",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
}
|
| 65 |
|
| 66 |
# Job title keywords
|
|
|
|
| 61 |
# Other Tools
|
| 62 |
"jira", "confluence", "slack", "discord", "figma", "sketch",
|
| 63 |
"vim", "vscode", "intellij",
|
| 64 |
+
|
| 65 |
+
# Sante / Health
|
| 66 |
+
"infirmier", "infirmiere", "pharmacien", "pharmacienne", "chirurgie",
|
| 67 |
+
"soins infirmiers", "soins intensifs", "bloc operatoire", "urgences",
|
| 68 |
+
"radiologie", "anesthesie", "pediatrie", "geriatrie", "cardiologie",
|
| 69 |
+
"neurologie", "oncologie", "kinesitherapie", "orthophonie",
|
| 70 |
+
"aide soignant", "auxiliaire de vie", "sage femme", "medecin",
|
| 71 |
+
"urgentiste", "generaliste", "specialiste", "imagerie medicale",
|
| 72 |
+
"pharmacologie", "biologie medicale", "dossier patient", "hip",
|
| 73 |
+
|
| 74 |
+
# Commerce / Vente
|
| 75 |
+
"negociation", "prospection", "crm", "salesforce", "hubspot",
|
| 76 |
+
"vente", "commerce", "relation client", "fidélisation", "fidélisation client",
|
| 77 |
+
"fidelisation", "pipeline commercial", "force de vente", "b2b", "b2c",
|
| 78 |
+
"cold calling", "account management", "key account", "grands comptes",
|
| 79 |
+
"administration des ventes", "devis", "facturation", "customer success",
|
| 80 |
+
"closing", "lead generation",
|
| 81 |
+
|
| 82 |
+
# Finance / Comptabilite
|
| 83 |
+
"comptabilite", "comptabilité", "audit", "controle de gestion",
|
| 84 |
+
"excel financier", "ifrs", "normes ifrs", "bilan", "liasses fiscales",
|
| 85 |
+
"gestion budgetaire", "tresorerie", "fiscalite", "sage comptabilité",
|
| 86 |
+
"sage compta", "sage", "cegid", "erp finance", "consolidation",
|
| 87 |
+
"reporting financier", "analyse financiere", "due diligence",
|
| 88 |
+
"commissariat aux comptes", "expert comptable", "bilan comptable",
|
| 89 |
+
"grand livre", "journaux comptables", "pcg", "tva",
|
| 90 |
+
|
| 91 |
+
# Marketing
|
| 92 |
+
"seo", "sea", "sem", "content marketing", "branding", "brand management",
|
| 93 |
+
"marketing digital", "emailing", "e-mailing", "google analytics",
|
| 94 |
+
"google ads", "facebook ads", "meta ads", "reseaux sociaux",
|
| 95 |
+
"community management", "inbound marketing", "marketing automation",
|
| 96 |
+
"mailchimp", "hubspot marketing", "copywriting", "ux writing",
|
| 97 |
+
"webmarketing", "e-commerce", "shopify", "growth hacking",
|
| 98 |
+
|
| 99 |
+
# BTP / Construction
|
| 100 |
+
"conduite de chantier", "autocad", "maconnerie", "gros oeuvre",
|
| 101 |
+
"second oeuvre", "menuiserie", "plomberie", "electricite batiment",
|
| 102 |
+
"genie civil", "architecture", "bim", "revit", "suivi de chantier",
|
| 103 |
+
"planification chantier", "metres", "cubature", "beton arme",
|
| 104 |
+
"coffreur", "platrerie", "carrelage", "peinture batiment",
|
| 105 |
+
"chef de chantier", "conducteur de travaux",
|
| 106 |
+
|
| 107 |
+
# Droit / Juridique
|
| 108 |
+
"droit du travail", "contrats", "contentieux", "droit commercial",
|
| 109 |
+
"droit civil", "droit penal", "jurisprudence", "redaction juridique",
|
| 110 |
+
"negociation contractuelle", "veille juridique", "rgpd", "droit des affaires",
|
| 111 |
+
"droit de la propriete intellectuelle", "droit des societes",
|
| 112 |
+
"recouvrement de creances", "arbitrage", "mediation",
|
| 113 |
+
|
| 114 |
+
# Ressources Humaines / RH
|
| 115 |
+
"recrutement", "gpec", "gestion des talents", "paie",
|
| 116 |
+
"administration du personnel", "droit social", "formation professionnelle",
|
| 117 |
+
"onboarding", "marque employeur", "entretien professionnel",
|
| 118 |
+
"gestion des competences", "bilan social", "dsn", "sirh",
|
| 119 |
+
"workday", "peoplesoft", "adp", "talent management",
|
| 120 |
+
"sourcing", "assessment center", "mobilite interne",
|
| 121 |
}
|
| 122 |
|
| 123 |
# Job title keywords
|
app/api/candidates.py
CHANGED
|
@@ -30,7 +30,6 @@ def _can_access_profile(profile: Candidate, requesting_user: User) -> bool:
|
|
| 30 |
|
| 31 |
Rules:
|
| 32 |
- Owner always has access.
|
| 33 |
-
- A recruiter can read candidates they uploaded themselves.
|
| 34 |
- An authenticated recruiter can read a candidate-deposited profile that
|
| 35 |
has is_visible = True.
|
| 36 |
- Everything else is denied (return False → caller raises 404).
|
|
@@ -665,7 +664,11 @@ def update_candidate(
|
|
| 665 |
if not db_candidate:
|
| 666 |
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Candidate not found")
|
| 667 |
|
| 668 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 669 |
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Candidate not found")
|
| 670 |
|
| 671 |
for key, value in candidate.dict(exclude_unset=True).items():
|
|
@@ -691,7 +694,11 @@ def delete_candidate(
|
|
| 691 |
if not db_candidate:
|
| 692 |
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Candidate not found")
|
| 693 |
|
| 694 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 695 |
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Candidate not found")
|
| 696 |
|
| 697 |
db.delete(db_candidate)
|
|
|
|
| 30 |
|
| 31 |
Rules:
|
| 32 |
- Owner always has access.
|
|
|
|
| 33 |
- An authenticated recruiter can read a candidate-deposited profile that
|
| 34 |
has is_visible = True.
|
| 35 |
- Everything else is denied (return False → caller raises 404).
|
|
|
|
| 664 |
if not db_candidate:
|
| 665 |
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Candidate not found")
|
| 666 |
|
| 667 |
+
is_owner = (
|
| 668 |
+
(db_candidate.user_id is not None and db_candidate.user_id == current_user.id)
|
| 669 |
+
or (db_candidate.recruiter_id is not None and db_candidate.recruiter_id == current_user.id)
|
| 670 |
+
)
|
| 671 |
+
if not is_owner and current_user.role != UserRole.admin:
|
| 672 |
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Candidate not found")
|
| 673 |
|
| 674 |
for key, value in candidate.dict(exclude_unset=True).items():
|
|
|
|
| 694 |
if not db_candidate:
|
| 695 |
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Candidate not found")
|
| 696 |
|
| 697 |
+
is_owner = (
|
| 698 |
+
(db_candidate.user_id is not None and db_candidate.user_id == current_user.id)
|
| 699 |
+
or (db_candidate.recruiter_id is not None and db_candidate.recruiter_id == current_user.id)
|
| 700 |
+
)
|
| 701 |
+
if not is_owner and current_user.role != UserRole.admin:
|
| 702 |
raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Candidate not found")
|
| 703 |
|
| 704 |
db.delete(db_candidate)
|
app/api/matching.py
CHANGED
|
@@ -1053,6 +1053,69 @@ async def generate_and_match(
|
|
| 1053 |
}
|
| 1054 |
|
| 1055 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1056 |
@router.post("/{criteria_id:int}/results", response_model=List[CriteriaMatchResultResponse])
|
| 1057 |
async def launch_matching_for_criteria(
|
| 1058 |
criteria_id: int,
|
|
|
|
| 1053 |
}
|
| 1054 |
|
| 1055 |
|
| 1056 |
+
class RankAllResult(BaseModel):
    """One row of the rank-all response: a candidate and how well it matches.

    Instances are built by ``rank_all_candidates`` from the per-candidate
    scoring output and returned as a list ordered best match first.
    """

    # 1-based position in the ranking (1 = highest score).
    rank: int
    # Identity of the ranked candidate; name/email are "" when unset.
    candidate_id: int
    full_name: str
    email: str
    # Match score used as the sort key, plus the scorer's coverage figure.
    score: float
    coverage: float
    # Skill names reported by the scorer as matched / missing.
    matched_skills: List[str]
    missing_skills: List[str]

    class Config:
        from_attributes = True
|
| 1069 |
+
|
| 1070 |
+
|
| 1071 |
+
@router.get("/{criteria_id}/rank-all", response_model=List[RankAllResult])
def rank_all_candidates(
    criteria_id: int,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db),
):
    """Score the caller's candidates against a criteria set and rank them.

    Every candidate whose ``recruiter_id`` or ``user_id`` equals the current
    user's id is scored on the fly with ``score_candidate_against_criteria``,
    so candidates without stored match results are ranked too.

    Args:
        criteria_id: Primary key of the ``JobCriteria`` to rank against.
        current_user: Authenticated user, injected by FastAPI.
        db: Database session, injected by FastAPI.

    Returns:
        A list of ``RankAllResult`` ordered by descending score, with ``rank``
        starting at 1. Candidates with equal scores keep their query order.

    Raises:
        HTTPException: 404 when no criteria row has the given id.
    """
    from sqlalchemy import or_ as sa_or

    criteria = db.query(JobCriteria).filter(JobCriteria.id == criteria_id).first()
    if not criteria:
        raise HTTPException(status_code=404, detail="Criteria not found")
    # NOTE(review): only existence of the criteria is checked here; whether the
    # caller owns it is not verified in this function — confirm that is intended.

    criteria_skills = _load_criteria_skills(criteria_id, db)
    skill_universe = build_skill_universe(db)

    # A candidate belongs to the caller if they uploaded it or it is their own profile.
    owned_by_caller = sa_or(
        Candidate.recruiter_id == current_user.id,
        Candidate.user_id == current_user.id,
    )
    candidates = db.query(Candidate).filter(owned_by_caller).all()

    def _to_entry(cand) -> Dict:
        # Flatten one candidate plus its scoring details into response fields.
        score, details = score_candidate_against_criteria(cand, criteria_skills, skill_universe)
        return {
            "candidate_id": cast(int, cand.id),
            "full_name": cast(str, cand.full_name or ""),
            "email": cast(str, cand.email or ""),
            "score": score,
            "coverage": float(details.get("coverage", 0)),
            "matched_skills": list(details.get("matched_skills", [])),
            "missing_skills": list(details.get("missing_skills", [])),
        }

    # sorted() is stable, so ties stay in the order the query returned them.
    best_first = sorted(
        (_to_entry(cand) for cand in candidates),
        key=lambda entry: entry["score"],
        reverse=True,
    )
    return [
        RankAllResult(rank=position, **entry)
        for position, entry in enumerate(best_first, start=1)
    ]
|
| 1117 |
+
|
| 1118 |
+
|
| 1119 |
@router.post("/{criteria_id:int}/results", response_model=List[CriteriaMatchResultResponse])
|
| 1120 |
async def launch_matching_for_criteria(
|
| 1121 |
criteria_id: int,
|
app/services/cv_extractor.py
CHANGED
|
@@ -356,13 +356,17 @@ class CVExtractionService:
|
|
| 356 |
|
| 357 |
def _clean_name(self, name: Any) -> Optional[str]:
|
| 358 |
value = str(name or "").strip()
|
|
|
|
|
|
|
|
|
|
| 359 |
if not value:
|
| 360 |
return None
|
| 361 |
if "@" in value or "http" in value.lower():
|
| 362 |
return None
|
| 363 |
if any(ch.isdigit() for ch in value):
|
| 364 |
return None
|
| 365 |
-
words = [w for w in re.split(r"\s+", value) if w]
|
|
|
|
| 366 |
if len(words) < 2 or len(words) > 4:
|
| 367 |
return None
|
| 368 |
return " ".join(word.capitalize() for word in words)
|
|
@@ -460,7 +464,10 @@ class CVExtractionService:
|
|
| 460 |
|
| 461 |
def _normalize_text_for_extraction(self, text: str) -> str:
|
| 462 |
"""Normalize noisy PDF extraction output to improve entity detection."""
|
| 463 |
-
|
|
|
|
|
|
|
|
|
|
| 464 |
normalized = re.sub(r"[ \t]+", " ", normalized)
|
| 465 |
normalized = re.sub(r"\n{3,}", "\n\n", normalized)
|
| 466 |
return normalized.strip()
|
|
|
|
| 356 |
|
| 357 |
def _clean_name(self, name: Any) -> Optional[str]:
|
| 358 |
value = str(name or "").strip()
|
| 359 |
+
# Strip UTF-8 BOM and zero-width chars that cause capitalize() to lowercase
|
| 360 |
+
# the real first letter (BOM becomes the first char, everything else lowercased).
|
| 361 |
+
value = value.lstrip("").strip()
|
| 362 |
if not value:
|
| 363 |
return None
|
| 364 |
if "@" in value or "http" in value.lower():
|
| 365 |
return None
|
| 366 |
if any(ch.isdigit() for ch in value):
|
| 367 |
return None
|
| 368 |
+
words = [w.lstrip("") for w in re.split(r"\s+", value) if w]
|
| 369 |
+
words = [w for w in words if w]
|
| 370 |
if len(words) < 2 or len(words) > 4:
|
| 371 |
return None
|
| 372 |
return " ".join(word.capitalize() for word in words)
|
|
|
|
| 464 |
|
| 465 |
def _normalize_text_for_extraction(self, text: str) -> str:
|
| 466 |
"""Normalize noisy PDF extraction output to improve entity detection."""
|
| 467 |
+
# Remove UTF-8 BOM () and similar zero-width chars that corrupt the first
|
| 468 |
+
# word of extracted text and cause name capitalize() bugs.
|
| 469 |
+
normalized = text.lstrip("")
|
| 470 |
+
normalized = normalized.replace("\r", "\n")
|
| 471 |
normalized = re.sub(r"[ \t]+", " ", normalized)
|
| 472 |
normalized = re.sub(r"\n{3,}", "\n\n", normalized)
|
| 473 |
return normalized.strip()
|