| |
| """ |
| Multi-CV extraction benchmark. |
| |
| Runs the CV extraction pipeline against several resume layouts and reports |
| coverage scores per section so we can track robustness across formats. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import re |
| from dataclasses import dataclass |
| from pathlib import Path |
| from typing import Any, Dict, List, Optional |
|
|
| from app.services.cv_extractor import CVExtractionService |
|
|
|
|
@dataclass
class BenchmarkSample:
    """One synthetic resume together with its scoring targets.

    ``expected`` maps a section name (the samples in this file use
    ``identity``, ``experience``, ``education``, ``skills`` and
    ``enrichment``) to the item count used as the target for that section.
    """

    # Label printed in the benchmark report.
    name: str
    # Raw CV text handed to the extraction service.
    text: str
    # Section name -> target item count.
    expected: Dict[str, int]
|
|
|
|
def _looks_like_text(candidate: str) -> bool:
    """Return True when *candidate* plausibly is readable text, not mojibake."""
    cleaned = candidate.strip()
    if len(cleaned) < 20 or "\x00" in cleaned:
        return False
    return sum(1 for char in cleaned if char.isalpha()) >= 10


def _load_reference_text() -> str:
    """Best-effort loader for the existing test CV fixture.

    Reads ``test_cv.txt`` located next to this module and decodes it with the
    first encoding that yields plausible text (at least 20 characters after
    stripping, at least 10 letters, no NUL character).

    UTF-16 is only attempted when the raw bytes start with a UTF-16 BOM or
    contain a NUL byte. Trying it unconditionally is wrong: any even-length
    ASCII/UTF-8 file decodes "successfully" as UTF-16 into CJK code points
    that are alphabetic and therefore pass the plausibility check.

    Returns:
        The decoded text (not stripped), or ``""`` when the fixture is
        missing, unreadable, or cannot be decoded into plausible text.
    """
    fixture_path = Path(__file__).with_name("test_cv.txt")
    try:
        raw_bytes = fixture_path.read_bytes()
    except OSError:
        # Missing file, directory in its place, permission problem: this is a
        # best-effort loader, so all of these simply mean "no fixture".
        return ""

    has_utf16_bom = raw_bytes.startswith((b"\xff\xfe", b"\xfe\xff"))
    if has_utf16_bom or b"\x00" in raw_bytes:
        encodings = ("utf-16", "utf-16-le", "utf-8-sig", "latin-1")
    else:
        # "utf-8-sig" also accepts plain UTF-8 and drops a leading BOM.
        encodings = ("utf-8-sig", "latin-1")

    for encoding in encodings:
        try:
            text = raw_bytes.decode(encoding)
        except UnicodeError:
            continue
        if _looks_like_text(text):
            return text

    return ""
|
|
|
|
def _build_samples() -> List[BenchmarkSample]:
    """Return the built-in resume samples used by the benchmark.

    Each sample pairs a raw CV text with the per-section target counts that
    the scoring functions compare the extraction result against.
    """
    structured_english = """
JOHN SMITH
john.smith@example.com | +33 6 12 34 56 78 | linkedin.com/in/johnsmith

PROFESSIONAL SUMMARY
Senior Full Stack Developer with 8 years of experience in web development.

EXPERIENCE
Senior Developer - Tech Company Inc (2020-2024)
- Led team of 5 developers
- Built microservices using Python and FastAPI
- Managed PostgreSQL databases

Junior Developer - Startup LLC (2016-2020)
- Developed React frontend applications
- Worked with Node.js backend

EDUCATION
Bachelor of Science in Computer Science
University of Technology (2016)

SKILLS
Python, JavaScript, TypeScript, SQL, HTML/CSS, FastAPI, React, Docker
""".strip()

    french_with_links = """
MARIE DUPONT
marie.dupont@gmail.com | 06 12 34 56 78 | Paris
linkedin.com/in/mariedupont | github.com/mdupont | mariedupont.dev

PROFIL
Chef de projet digital orientée produit et expérience client.

EXPÉRIENCES PROFESSIONNELLES
Responsable Marketing Digital - Entreprise X (2021 - Présent)
- Pilotage de campagnes multi-canaux
- Analyse des performances et reporting

Chef de projet CRM - Société Y (2018 - 2021)
- Mise en place d'automatisations marketing
- Coordination avec les équipes produit et design

FORMATION
Master Marketing Digital - Université de Lyon (2018)

COMPÉTENCES
Communication, Organisation, Gestion de projet, Leadership, Sens du contact

CERTIFICATIONS
Google Analytics Individual Qualification
HubSpot Inbound Marketing

PROJETS
Refonte du parcours d'onboarding client
""".strip()

    ocr_noisy = """
ALEXANDRE MARTIN
alex.martin@example.com

EXPERIENCE PROFESSIONNELLE
2022 - PRESENT | DATA ENGINEER | BLUE ANALYTICS
Built ETL pipelines on Airflow and Spark
Implemented data quality checks and dashboards

2020 - 2022 - BI ANALYST - RETAIL GROUP
Automated SQL reporting and Power BI models

FORMATION
2019 - Master Data Science - Paris School of AI

LANGUES
French English

COMPETENCES
Python, SQL, Airflow, Spark, Power BI, Communication

PROJECTS
Customer churn prediction using Python and scikit-learn
""".strip()

    minimal_fallback = """
NOAH LEROY
noah.leroy@outlook.com

Some short CV text with little structure.
Python, SQL, Docker.
""".strip()

    ultra_short = """
NADIA BENALI
nadia.benali@example.com
+33 6 98 76 54 32
Paris, France

Python | SQL | Data analysis
""".strip()

    # (label, CV text, per-section target counts), in report order.
    catalogue = (
        (
            "Structured English CV",
            structured_english,
            {"identity": 3, "experience": 2, "education": 2, "skills": 6, "enrichment": 1},
        ),
        (
            "French CV with links",
            french_with_links,
            {"identity": 3, "experience": 2, "education": 1, "skills": 4, "enrichment": 4},
        ),
        (
            "OCR noisy CV",
            ocr_noisy,
            {"identity": 2, "experience": 2, "education": 1, "skills": 4, "enrichment": 2},
        ),
        (
            "Minimal fallback CV",
            minimal_fallback,
            {"identity": 2, "experience": 0, "education": 0, "skills": 2, "enrichment": 0},
        ),
        (
            "Ultra short CV",
            ultra_short,
            {"identity": 3, "experience": 0, "education": 0, "skills": 2, "enrichment": 1},
        ),
    )

    return [
        BenchmarkSample(name=label, text=body, expected=targets)
        for label, body, targets in catalogue
    ]
|
|
|
|
def _safe_len(value: Any) -> int:
    """Count the items carried by an extracted field.

    Lists, tuples, sets and frozensets count one per element. Any other value
    (string, dict, number, ...) counts as a single item when truthy and as
    zero otherwise, so ``None`` and ``""`` both yield 0.

    Args:
        value: The raw field value taken from the extraction result.

    Returns:
        The number of items the value represents.
    """
    # Tuples and sets used to fall through to the truthiness branch and were
    # reported as a single item regardless of their size.
    if isinstance(value, (list, tuple, set, frozenset)):
        return len(value)
    return 1 if value else 0
|
|
|
|
def _score_section(found: int, expected: int) -> float:
    """Convert a hit count into a 0-100 score for one section.

    Args:
        found: Number of items the extractor produced for the section.
        expected: Target number of items for the section.

    Returns:
        When ``expected`` is positive: the coverage ratio ``found / expected``
        capped at 1 and scaled to 100 (over-extraction is not penalised).
        When nothing is expected: 100 for an empty result, minus 25 points
        per spurious item, floored at 0.
    """
    if expected <= 0:
        # The previous formula returned min(found * 25, 100) for found > 0,
        # which is non-monotonic (0 found -> 100, 1 -> 25, 4 -> 100). The
        # capped term is treated here as a penalty subtracted from a perfect
        # score. NOTE(review): confirm this is the intended direction.
        return 100.0 - min(found * 25.0, 100.0)
    return min(found / expected, 1.0) * 100.0
|
|
|
|
def _build_section_scores(structured: Dict[str, Any]) -> Dict[str, float]:
    """Tally how many items the extraction produced for each scored section.

    Despite the name, the values are raw hit counts (returned as floats), not
    percentages.

    Args:
        structured: Extraction result, read through ``.get`` with the keys
            listed below.

    Returns:
        A dict with the keys ``identity``, ``experience``, ``education``,
        ``skills`` and ``enrichment``.
    """
    identity_keys = ("full_name", "email", "phone")
    enrichment_keys = (
        "linkedin_urls",
        "github_urls",
        "portfolio_urls",
        "certifications",
        "projects",
        "languages",
        "soft_skills",
        "interests",
    )

    # Identity fields are scalar: each one counts once when it is truthy.
    identity_hits = 0
    for key in identity_keys:
        if structured.get(key):
            identity_hits += 1

    experience_hits = _safe_len(structured.get("experiences"))
    education_hits = _safe_len(structured.get("education"))
    skill_hits = _safe_len(structured.get("skills"))

    # Enrichment pools every optional signal into a single count.
    enrichment_hits = 0
    for key in enrichment_keys:
        enrichment_hits += _safe_len(structured.get(key))

    tallies = {
        "identity": identity_hits,
        "experience": experience_hits,
        "education": education_hits,
        "skills": skill_hits,
        "enrichment": enrichment_hits,
    }
    return {section: float(count) for section, count in tallies.items()}
|
|
|
|
def _overall_score(section_hits: Dict[str, float], expected: Dict[str, int]) -> float:
    """Blend the per-section scores into one weighted figure.

    Args:
        section_hits: Hit count per section; must contain every weighted
            section (a missing one raises ``KeyError``).
        expected: Target count per section; a missing section counts as 0.

    Returns:
        The weighted sum of the section scores, rounded to one decimal.
    """
    # The weights add up to 1.0, so the blend stays on the section scale.
    weighted_sections = (
        ("identity", 0.25),
        ("experience", 0.30),
        ("education", 0.15),
        ("skills", 0.15),
        ("enrichment", 0.15),
    )

    blended = 0.0
    for section_name, share in weighted_sections:
        hits = int(section_hits[section_name])
        target = expected.get(section_name, 0)
        blended += share * _score_section(hits, target)

    return round(blended, 1)
|
|
|
|
def _diagnose_missing_fields(sample: BenchmarkSample, structured: Dict[str, Any]) -> List[str]:
    """Return human-readable reasons for missing fields in a given sample.

    For every field group that is empty in ``structured`` the raw sample text
    is inspected with light heuristics to tell "present in the CV but not
    extracted" apart from "not present at all".

    Keyword checks run on a lower-cased, accent-folded copy of the text so
    that accented headers such as "EXPÉRIENCE" or "ÉDUCATION" match the
    unaccented keywords.

    Args:
        sample: The benchmark sample; only ``sample.text`` is read.
        structured: Extraction result, read through ``.get``.

    Returns:
        The diagnostic messages (in French, as printed in the report); empty
        when nothing is missing.
    """
    import unicodedata  # local import: not part of the module-level imports

    def _fold(text: str) -> str:
        # Lower-case, then drop combining marks so "é" compares equal to "e".
        decomposed = unicodedata.normalize("NFKD", text.lower())
        return "".join(char for char in decomposed if not unicodedata.combining(char))

    reasons: List[str] = []
    lines = [line.strip() for line in sample.text.splitlines() if line.strip()]
    normalized_text = _fold(sample.text)

    if not structured.get("full_name"):
        # A name-like line has 2 to 4 word tokens and no "@" or digit.
        top_lines = lines[:5]
        has_name_like_line = any(
            2 <= len(re.findall(r"[A-Za-zÀ-ÿ'-]+", line)) <= 4 and not re.search(r"[@\d]", line)
            for line in top_lines
        )
        if has_name_like_line:
            reasons.append("Nom probable présent en haut du CV mais rejeté par les filtres de nom.")
        elif structured.get("email"):
            reasons.append("Nom absent mais un email est disponible: vérifier l'inférence depuis l'email.")
        else:
            reasons.append("Aucune ligne de nom claire détectée dans les premières lignes.")

    if not structured.get("phone"):
        has_phone_like_text = bool(re.search(r"\+?\d[\d\s().-]{7,}\d", sample.text))
        if has_phone_like_text:
            reasons.append("Un numéro semble présent mais n'a pas passé la normalisation téléphone.")
        else:
            reasons.append("Aucun motif téléphone suffisamment clair détecté.")

    if not structured.get("experiences"):
        # "experience" also covers "experiences" and "work experience".
        experience_keywords = ("experience", "professionnelle", "stage")
        if any(keyword in normalized_text for keyword in experience_keywords):
            reasons.append("Section expérience détectée mais aucun bloc stable titre/entreprise/période n'a pu être construit.")
        else:
            reasons.append("Aucune section expérience ou ancre de période détectée.")

    education_keywords = ("formation", "education", "study", "universit", "school")
    if not structured.get("education") and any(keyword in normalized_text for keyword in education_keywords):
        reasons.append("Section formation présente mais les lignes ne ressemblaient pas assez à de l'éducation.")

    if not structured.get("skills"):
        technical_tokens = ("python", "sql", "java", "react", "docker", "airflow", "spark")
        soft_keywords = (
            "communication",
            "organisation",
            "leadership",
            "rigueur",
            "autonomie",
            "gestion de projet",
            "sens du contact",
        )
        if any(token in normalized_text for token in technical_tokens):
            reasons.append("Des mots-clés techniques existent mais la normalisation a raté l'extraction de compétences.")
        elif any(keyword in normalized_text for keyword in soft_keywords):
            reasons.append("Le CV contient surtout des compétences génériques/soft skills; vérifier si elles doivent être reportées dans skills ou seulement dans soft_skills.")
        else:
            reasons.append("Aucune compétence technique évidente détectée.")

    if not (
        structured.get("linkedin_urls")
        or structured.get("github_urls")
        or structured.get("portfolio_urls")
        or structured.get("projects")
        or structured.get("certifications")
    ):
        reasons.append("Aucun signal d'enrichissement (liens/projets/certifications) détecté.")

    return reasons
|
|
|
|
def run_benchmark(diagnostic: bool = False, *, pass_threshold: float = 70.0) -> int:
    """Run every built-in sample through the extractor and print a report.

    Args:
        diagnostic: When true, also print the reasons returned by
            ``_diagnose_missing_fields`` for each sample.
        pass_threshold: Minimum average overall score for a passing run.

    Returns:
        ``0`` when the average overall score reaches ``pass_threshold``,
        otherwise ``1`` (usable directly as a process exit code).
    """
    service = CVExtractionService()
    samples = _build_samples()
    banner = "=" * 78
    section_order = ("identity", "experience", "education", "skills", "enrichment")
    link_keys = ("linkedin_urls", "github_urls", "portfolio_urls")

    print(banner)
    print("Multi-CV Extraction Benchmark")
    print(banner)

    aggregate: List[float] = []

    for index, sample in enumerate(samples, start=1):
        result = service.extract_from_text(sample.text)
        structured = result.structured
        section_hits = _build_section_scores(structured)
        overall = _overall_score(section_hits, sample.expected)
        aggregate.append(overall)

        # Same lookup rule as _overall_score: a section missing from
        # ``expected`` counts as 0 instead of raising KeyError.
        section_report = ", ".join(
            f"{section}={_score_section(int(section_hits[section]), sample.expected.get(section, 0)):.1f}"
            for section in section_order
        )
        # Counts go through _safe_len, like the scoring path, so a field that
        # is present but None (or a scalar) cannot crash the report.
        link_total = sum(_safe_len(structured.get(key)) for key in link_keys)

        print(f"\n[{index}] {sample.name}")
        print(f" Overall: {overall:.1f}/100")
        print(f" Sections: {section_report}")
        print(
            " Extracted: "
            f"name={bool(structured.get('full_name'))}, "
            f"email={bool(structured.get('email'))}, "
            f"phone={bool(structured.get('phone'))}, "
            f"experiences={_safe_len(structured.get('experiences'))}, "
            f"education={_safe_len(structured.get('education'))}, "
            f"skills={_safe_len(structured.get('skills'))}, "
            f"links={link_total}, "
            f"projects={_safe_len(structured.get('projects'))}, "
            f"certifications={_safe_len(structured.get('certifications'))}"
        )

        if diagnostic:
            reasons = _diagnose_missing_fields(sample, structured)
            if reasons:
                print(" Diagnostics:")
                for reason in reasons:
                    print(f" - {reason}")

    average_score = round(sum(aggregate) / len(aggregate), 1) if aggregate else 0.0
    print("\n" + banner)
    print(f"Average overall score: {average_score:.1f}/100")
    print(banner)

    return 0 if average_score >= pass_threshold else 1
|
|
|
|
def _main() -> int:
    """Parse the command-line flags, run the benchmark, return its exit code."""
    cli = argparse.ArgumentParser(description="Run the CV extraction benchmark")
    cli.add_argument(
        "--diagnostic",
        action="store_true",
        help="Print human-readable reasons for missing fields",
    )
    options = cli.parse_args()
    return run_benchmark(diagnostic=options.diagnostic)


if __name__ == "__main__":
    raise SystemExit(_main())