ilyass yani committed on
Commit ·
b38632c
1
Parent(s): 4cc2fc6
fix: nettoyage final extraction - companies propres + filtre interets generique
Browse files- app/services/cv_extractor.py +36 -0
app/services/cv_extractor.py
CHANGED
|
@@ -307,6 +307,11 @@ class CVExtractionService:
|
|
| 307 |
merged[key] = hf[key]
|
| 308 |
|
| 309 |
# Merge list fields with de-duplication while preserving order.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
list_keys = [
|
| 311 |
"emails", "phones", "companies", "job_titles", "education", "skills",
|
| 312 |
"languages", "soft_skills", "interests", "certifications", "projects",
|
|
@@ -315,6 +320,9 @@ class CVExtractionService:
|
|
| 315 |
for key in list_keys:
|
| 316 |
base_list = merged.get(key) if isinstance(merged.get(key), list) else []
|
| 317 |
hf_list = hf.get(key) if isinstance(hf.get(key), list) else []
|
|
|
|
|
|
|
|
|
|
| 318 |
|
| 319 |
combined = []
|
| 320 |
seen = set()
|
|
@@ -344,6 +352,34 @@ class CVExtractionService:
|
|
| 344 |
"""Normalize and validate extracted entities to improve precision."""
|
| 345 |
cleaned = dict(structured or {})
|
| 346 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 347 |
cleaned["emails"] = self._clean_emails(cleaned.get("emails"), cleaned.get("email"))
|
| 348 |
cleaned["email"] = cleaned["emails"][0] if cleaned["emails"] else None
|
| 349 |
|
|
|
|
| 307 |
merged[key] = hf[key]
|
| 308 |
|
| 309 |
# Merge list fields with de-duplication while preserving order.
|
| 310 |
+
# GLiNER-owned fields (companies, education, job_titles, interests) are NOT merged
|
| 311 |
+
# with the BERT/legacy output when GLiNER already produced a result:
|
| 312 |
+
# BERT introduces wordpiece artifacts (##cence, ##P) and fragments
|
| 313 |
+
# (Esp, Li) that would pollute the clean GLiNER lists.
|
| 314 |
+
gliner_owned = ("companies", "education", "job_titles", "interests")
|
| 315 |
list_keys = [
|
| 316 |
"emails", "phones", "companies", "job_titles", "education", "skills",
|
| 317 |
"languages", "soft_skills", "interests", "certifications", "projects",
|
|
|
|
| 320 |
for key in list_keys:
|
| 321 |
base_list = merged.get(key) if isinstance(merged.get(key), list) else []
|
| 322 |
hf_list = hf.get(key) if isinstance(hf.get(key), list) else []
|
| 323 |
+
# Keep the clean GLiNER list untouched for its owned fields.
|
| 324 |
+
if key in gliner_owned and base_list:
|
| 325 |
+
continue
|
| 326 |
|
| 327 |
combined = []
|
| 328 |
seen = set()
|
|
|
|
| 352 |
"""Normalize and validate extracted entities to improve precision."""
|
| 353 |
cleaned = dict(structured or {})
|
| 354 |
|
| 355 |
+
# Clean interests (generic): drop form labels (/ or |), the candidate's
|
| 356 |
+
# own name (case-insensitive), CV section headers that leaked in, and case-insensitive duplicates.
|
| 357 |
+
_name_norm = (cleaned.get("full_name") or cleaned.get("name") or "").strip().lower()
|
| 358 |
+
_section_words = {
|
| 359 |
+
"intitule du poste", "intitule du poste / stage", "intitule",
|
| 360 |
+
"profil", "profile", "contact", "langues", "languages",
|
| 361 |
+
"competences", "competence", "skills", "formation", "formations",
|
| 362 |
+
"experience", "experiences", "education", "centres d interet",
|
| 363 |
+
"objectif", "objectifs", "references", "projets", "certifications",
|
| 364 |
+
}
|
| 365 |
+
_src = cleaned.get("interests") if isinstance(cleaned.get("interests"), list) else []
|
| 366 |
+
_clean_int = []
|
| 367 |
+
_seen_int = set()
|
| 368 |
+
for _it in _src:
|
| 369 |
+
_v = str(_it or "").strip()
|
| 370 |
+
if not _v or "/" in _v or "|" in _v:
|
| 371 |
+
continue
|
| 372 |
+
_low = _v.lower()
|
| 373 |
+
if _name_norm and _low == _name_norm:
|
| 374 |
+
continue
|
| 375 |
+
if _low in _section_words:
|
| 376 |
+
continue
|
| 377 |
+
if _low in _seen_int:
|
| 378 |
+
continue
|
| 379 |
+
_seen_int.add(_low)
|
| 380 |
+
_clean_int.append(_v)
|
| 381 |
+
cleaned["interests"] = _clean_int
|
| 382 |
+
|
| 383 |
cleaned["emails"] = self._clean_emails(cleaned.get("emails"), cleaned.get("email"))
|
| 384 |
cleaned["email"] = cleaned["emails"][0] if cleaned["emails"] else None
|
| 385 |
|