ilyass yani committed on
Commit
b38632c
·
1 Parent(s): 4cc2fc6

fix: nettoyage final extraction - companies propres + filtre interets generique

Browse files
Files changed (1) hide show
  1. app/services/cv_extractor.py +36 -0
app/services/cv_extractor.py CHANGED
@@ -307,6 +307,11 @@ class CVExtractionService:
307
  merged[key] = hf[key]
308
 
309
  # Merge list fields with de-duplication while preserving order.
 
 
 
 
 
310
  list_keys = [
311
  "emails", "phones", "companies", "job_titles", "education", "skills",
312
  "languages", "soft_skills", "interests", "certifications", "projects",
@@ -315,6 +320,9 @@ class CVExtractionService:
315
  for key in list_keys:
316
  base_list = merged.get(key) if isinstance(merged.get(key), list) else []
317
  hf_list = hf.get(key) if isinstance(hf.get(key), list) else []
 
 
 
318
 
319
  combined = []
320
  seen = set()
@@ -344,6 +352,34 @@ class CVExtractionService:
344
  """Normalize and validate extracted entities to improve precision."""
345
  cleaned = dict(structured or {})
346
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
347
  cleaned["emails"] = self._clean_emails(cleaned.get("emails"), cleaned.get("email"))
348
  cleaned["email"] = cleaned["emails"][0] if cleaned["emails"] else None
349
 
 
307
  merged[key] = hf[key]
308
 
309
  # Merge list fields with de-duplication while preserving order.
310
+ # GLiNER-owned fields (companies, education, job_titles, interests) are NOT merged
311
+ # with the BERT/legacy output when GLiNER already produced a result:
312
+ # BERT introduces wordpiece artifacts (##cence, ##P) and fragments
313
+ # (Esp, Li) that would pollute the clean GLiNER lists.
314
+ gliner_owned = ("companies", "education", "job_titles", "interests")
315
  list_keys = [
316
  "emails", "phones", "companies", "job_titles", "education", "skills",
317
  "languages", "soft_skills", "interests", "certifications", "projects",
 
320
  for key in list_keys:
321
  base_list = merged.get(key) if isinstance(merged.get(key), list) else []
322
  hf_list = hf.get(key) if isinstance(hf.get(key), list) else []
323
+ # Keep the clean GLiNER list untouched for its owned fields.
324
+ if key in gliner_owned and base_list:
325
+ continue
326
 
327
  combined = []
328
  seen = set()
 
352
  """Normalize and validate extracted entities to improve precision."""
353
  cleaned = dict(structured or {})
354
 
355
+ # Clean interests (generic): drop form labels (/ or |), the candidate's
356
+ # own name (case-insensitive), CV section headers that leaked in, and case-insensitive duplicates.
357
+ _name_norm = (cleaned.get("full_name") or cleaned.get("name") or "").strip().lower()
358
+ _section_words = {
359
+ "intitule du poste", "intitule du poste / stage", "intitule",
360
+ "profil", "profile", "contact", "langues", "languages",
361
+ "competences", "competence", "skills", "formation", "formations",
362
+ "experience", "experiences", "education", "centres d interet",
363
+ "objectif", "objectifs", "references", "projets", "certifications",
364
+ }
365
+ _src = cleaned.get("interests") if isinstance(cleaned.get("interests"), list) else []
366
+ _clean_int = []
367
+ _seen_int = set()
368
+ for _it in _src:
369
+ _v = str(_it or "").strip()
370
+ if not _v or "/" in _v or "|" in _v:
371
+ continue
372
+ _low = _v.lower()
373
+ if _name_norm and _low == _name_norm:
374
+ continue
375
+ if _low in _section_words:
376
+ continue
377
+ if _low in _seen_int:
378
+ continue
379
+ _seen_int.add(_low)
380
+ _clean_int.append(_v)
381
+ cleaned["interests"] = _clean_int
382
+
383
  cleaned["emails"] = self._clean_emails(cleaned.get("emails"), cleaned.get("email"))
384
  cleaned["email"] = cleaned["emails"][0] if cleaned["emails"] else None
385