ilyass yani committed on
Commit
9614c9c
·
1 Parent(s): 9e6bc9f

Batch-2: fix CosineScorer, delete/put candidates, BOM extraction, ranking, _can_access_profile

Browse files
ai_module/matching/__init__.py CHANGED
@@ -9,11 +9,13 @@
9
  # from ai_module.matching import BertClassifierAdapter
10
  from ai_module.matching.hybrid_matcher import HybridConfig, HybridMatcher
11
  from ai_module.matching.bert_classifier_adapter import BertClassifierAdapter, get_default_adapter
 
12
 
13
  __all__ = [
14
  "HybridConfig",
15
  "HybridMatcher",
16
  "BertClassifierAdapter",
17
  "get_default_adapter",
 
18
  ]
19
 
 
9
  # from ai_module.matching import BertClassifierAdapter
10
  from ai_module.matching.hybrid_matcher import HybridConfig, HybridMatcher
11
  from ai_module.matching.bert_classifier_adapter import BertClassifierAdapter, get_default_adapter
12
+ from ai_module.matching.scorer import CosineScorer
13
 
14
  __all__ = [
15
  "HybridConfig",
16
  "HybridMatcher",
17
  "BertClassifierAdapter",
18
  "get_default_adapter",
19
+ "CosineScorer",
20
  ]
21
 
ai_module/nlp/resume_ner_extractor.py CHANGED
@@ -61,6 +61,63 @@ class ResumeNERExtractor:
61
  # Other Tools
62
  "jira", "confluence", "slack", "discord", "figma", "sketch",
63
  "vim", "vscode", "intellij",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  }
65
 
66
  # Job title keywords
 
61
  # Other Tools
62
  "jira", "confluence", "slack", "discord", "figma", "sketch",
63
  "vim", "vscode", "intellij",
64
+
65
+ # Sante / Health
66
+ "infirmier", "infirmiere", "pharmacien", "pharmacienne", "chirurgie",
67
+ "soins infirmiers", "soins intensifs", "bloc operatoire", "urgences",
68
+ "radiologie", "anesthesie", "pediatrie", "geriatrie", "cardiologie",
69
+ "neurologie", "oncologie", "kinesitherapie", "orthophonie",
70
+ "aide soignant", "auxiliaire de vie", "sage femme", "medecin",
71
+ "urgentiste", "generaliste", "specialiste", "imagerie medicale",
72
+ "pharmacologie", "biologie medicale", "dossier patient", "hip",
73
+
74
+ # Commerce / Vente
75
+ "negociation", "prospection", "crm", "salesforce", "hubspot",
76
+ "vente", "commerce", "relation client", "fidélisation", "fidélisation client",
77
+ "fidelisation", "pipeline commercial", "force de vente", "b2b", "b2c",
78
+ "cold calling", "account management", "key account", "grands comptes",
79
+ "administration des ventes", "devis", "facturation", "customer success",
80
+ "closing", "lead generation",
81
+
82
+ # Finance / Comptabilite
83
+ "comptabilite", "comptabilité", "audit", "controle de gestion",
84
+ "excel financier", "ifrs", "normes ifrs", "bilan", "liasses fiscales",
85
+ "gestion budgetaire", "tresorerie", "fiscalite", "sage comptabilité",
86
+ "sage compta", "sage", "cegid", "erp finance", "consolidation",
87
+ "reporting financier", "analyse financiere", "due diligence",
88
+ "commissariat aux comptes", "expert comptable", "bilan comptable",
89
+ "grand livre", "journaux comptables", "pcg", "tva",
90
+
91
+ # Marketing
92
+ "seo", "sea", "sem", "content marketing", "branding", "brand management",
93
+ "marketing digital", "emailing", "e-mailing", "google analytics",
94
+ "google ads", "facebook ads", "meta ads", "reseaux sociaux",
95
+ "community management", "inbound marketing", "marketing automation",
96
+ "mailchimp", "hubspot marketing", "copywriting", "ux writing",
97
+ "webmarketing", "e-commerce", "shopify", "growth hacking",
98
+
99
+ # BTP / Construction
100
+ "conduite de chantier", "autocad", "maconnerie", "gros oeuvre",
101
+ "second oeuvre", "menuiserie", "plomberie", "electricite batiment",
102
+ "genie civil", "architecture", "bim", "revit", "suivi de chantier",
103
+ "planification chantier", "metres", "cubature", "beton arme",
104
+ "coffreur", "platrerie", "carrelage", "peinture batiment",
105
+ "chef de chantier", "conducteur de travaux",
106
+
107
+ # Droit / Juridique
108
+ "droit du travail", "contrats", "contentieux", "droit commercial",
109
+ "droit civil", "droit penal", "jurisprudence", "redaction juridique",
110
+ "negociation contractuelle", "veille juridique", "rgpd", "droit des affaires",
111
+ "droit de la propriete intellectuelle", "droit des societes",
112
+ "recouvrement de creances", "arbitrage", "mediation",
113
+
114
+ # Ressources Humaines / RH
115
+ "recrutement", "gpec", "gestion des talents", "paie",
116
+ "administration du personnel", "droit social", "formation professionnelle",
117
+ "onboarding", "marque employeur", "entretien professionnel",
118
+ "gestion des competences", "bilan social", "dsn", "sirh",
119
+ "workday", "peoplesoft", "adp", "talent management",
120
+ "sourcing", "assessment center", "mobilite interne",
121
  }
122
 
123
  # Job title keywords
app/api/candidates.py CHANGED
@@ -30,7 +30,6 @@ def _can_access_profile(profile: Candidate, requesting_user: User) -> bool:
30
 
31
  Rules:
32
  - Owner always has access.
33
- - A recruiter can read candidates they uploaded themselves.
34
  - An authenticated recruiter can read a candidate-deposited profile that
35
  has is_visible = True.
36
  - Everything else is denied (return False → caller raises 404).
@@ -665,7 +664,11 @@ def update_candidate(
665
  if not db_candidate:
666
  raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Candidate not found")
667
 
668
- if db_candidate.user_id != current_user.id and current_user.role != UserRole.admin:
 
 
 
 
669
  raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Candidate not found")
670
 
671
  for key, value in candidate.dict(exclude_unset=True).items():
@@ -691,7 +694,11 @@ def delete_candidate(
691
  if not db_candidate:
692
  raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Candidate not found")
693
 
694
- if db_candidate.user_id != current_user.id and current_user.role != UserRole.admin:
 
 
 
 
695
  raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Candidate not found")
696
 
697
  db.delete(db_candidate)
 
30
 
31
  Rules:
32
  - Owner always has access.
 
33
  - An authenticated recruiter can read a candidate-deposited profile that
34
  has is_visible = True.
35
  - Everything else is denied (return False → caller raises 404).
 
664
  if not db_candidate:
665
  raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Candidate not found")
666
 
667
+ is_owner = (
668
+ (db_candidate.user_id is not None and db_candidate.user_id == current_user.id)
669
+ or (db_candidate.recruiter_id is not None and db_candidate.recruiter_id == current_user.id)
670
+ )
671
+ if not is_owner and current_user.role != UserRole.admin:
672
  raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Candidate not found")
673
 
674
  for key, value in candidate.dict(exclude_unset=True).items():
 
694
  if not db_candidate:
695
  raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Candidate not found")
696
 
697
+ is_owner = (
698
+ (db_candidate.user_id is not None and db_candidate.user_id == current_user.id)
699
+ or (db_candidate.recruiter_id is not None and db_candidate.recruiter_id == current_user.id)
700
+ )
701
+ if not is_owner and current_user.role != UserRole.admin:
702
  raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Candidate not found")
703
 
704
  db.delete(db_candidate)
app/api/matching.py CHANGED
@@ -1053,6 +1053,69 @@ async def generate_and_match(
1053
  }
1054
 
1055
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1056
  @router.post("/{criteria_id:int}/results", response_model=List[CriteriaMatchResultResponse])
1057
  async def launch_matching_for_criteria(
1058
  criteria_id: int,
 
1053
  }
1054
 
1055
 
1056
+ class RankAllResult(BaseModel):
1057
+ """Ranked candidate entry for rank-all endpoint."""
1058
+ rank: int
1059
+ candidate_id: int
1060
+ full_name: str
1061
+ email: str
1062
+ score: float
1063
+ coverage: float
1064
+ matched_skills: List[str]
1065
+ missing_skills: List[str]
1066
+
1067
+ class Config:
1068
+ from_attributes = True
1069
+
1070
+
1071
+ @router.get("/{criteria_id}/rank-all", response_model=List[RankAllResult])
1072
+ def rank_all_candidates(
1073
+ criteria_id: int,
1074
+ current_user: User = Depends(get_current_user),
1075
+ db: Session = Depends(get_db),
1076
+ ):
1077
+ """Return all recruiter candidates scored and ranked by match score (best first).
1078
+
1079
+ Only candidates belonging to the current recruiter (recruiter_id or user_id) are
1080
+ included. The score is computed on-the-fly using CosineScorer so that even
1081
+ candidates without prior MatchResult records are ranked.
1082
+ """
1083
+ from sqlalchemy import or_ as sa_or
1084
+ criteria = db.query(JobCriteria).filter(JobCriteria.id == criteria_id).first()
1085
+ if not criteria:
1086
+ raise HTTPException(status_code=404, detail="Criteria not found")
1087
+
1088
+ criteria_skills = _load_criteria_skills(criteria_id, db)
1089
+ skill_universe = build_skill_universe(db)
1090
+
1091
+ candidates = (
1092
+ db.query(Candidate)
1093
+ .filter(
1094
+ sa_or(
1095
+ Candidate.recruiter_id == current_user.id,
1096
+ Candidate.user_id == current_user.id,
1097
+ )
1098
+ )
1099
+ .all()
1100
+ )
1101
+
1102
+ ranked: List[Dict] = []
1103
+ for cand in candidates:
1104
+ score, details = score_candidate_against_criteria(cand, criteria_skills, skill_universe)
1105
+ ranked.append({
1106
+ "candidate_id": cast(int, cand.id),
1107
+ "full_name": cast(str, cand.full_name or ""),
1108
+ "email": cast(str, cand.email or ""),
1109
+ "score": score,
1110
+ "coverage": float(details.get("coverage", 0)),
1111
+ "matched_skills": list(details.get("matched_skills", [])),
1112
+ "missing_skills": list(details.get("missing_skills", [])),
1113
+ })
1114
+
1115
+ ranked.sort(key=lambda item: item["score"], reverse=True)
1116
+ return [RankAllResult(rank=idx + 1, **entry) for idx, entry in enumerate(ranked)]
1117
+
1118
+
1119
  @router.post("/{criteria_id:int}/results", response_model=List[CriteriaMatchResultResponse])
1120
  async def launch_matching_for_criteria(
1121
  criteria_id: int,
app/services/cv_extractor.py CHANGED
@@ -356,13 +356,17 @@ class CVExtractionService:
356
 
357
  def _clean_name(self, name: Any) -> Optional[str]:
358
  value = str(name or "").strip()
 
 
 
359
  if not value:
360
  return None
361
  if "@" in value or "http" in value.lower():
362
  return None
363
  if any(ch.isdigit() for ch in value):
364
  return None
365
- words = [w for w in re.split(r"\s+", value) if w]
 
366
  if len(words) < 2 or len(words) > 4:
367
  return None
368
  return " ".join(word.capitalize() for word in words)
@@ -460,7 +464,10 @@ class CVExtractionService:
460
 
461
  def _normalize_text_for_extraction(self, text: str) -> str:
462
  """Normalize noisy PDF extraction output to improve entity detection."""
463
- normalized = text.replace("\r", "\n")
 
 
 
464
  normalized = re.sub(r"[ \t]+", " ", normalized)
465
  normalized = re.sub(r"\n{3,}", "\n\n", normalized)
466
  return normalized.strip()
 
356
 
357
  def _clean_name(self, name: Any) -> Optional[str]:
358
  value = str(name or "").strip()
359
+ # Strip UTF-8 BOM and zero-width chars that cause capitalize() to lowercase
360
+ # the real first letter (BOM becomes the first char, everything else lowercased).
361
+ value = value.lstrip("￾​‌‍⁠").strip()
362
  if not value:
363
  return None
364
  if "@" in value or "http" in value.lower():
365
  return None
366
  if any(ch.isdigit() for ch in value):
367
  return None
368
+ words = [w.lstrip("￾​‌‍⁠") for w in re.split(r"\s+", value) if w]
369
+ words = [w for w in words if w]
370
  if len(words) < 2 or len(words) > 4:
371
  return None
372
  return " ".join(word.capitalize() for word in words)
 
464
 
465
  def _normalize_text_for_extraction(self, text: str) -> str:
466
  """Normalize noisy PDF extraction output to improve entity detection."""
467
+ # Remove UTF-8 BOM (U+FEFF) and similar zero-width chars that corrupt the first
468
+ # word of extracted text and cause name capitalize() bugs.
469
+ normalized = text.lstrip("￾​‌‍")
470
+ normalized = normalized.replace("\r", "\n")
471
  normalized = re.sub(r"[ \t]+", " ", normalized)
472
  normalized = re.sub(r"\n{3,}", "\n\n", normalized)
473
  return normalized.strip()