Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| Multi-CV extraction benchmark. | |
| Runs the CV extraction pipeline against several resume layouts and reports | |
| coverage scores per section so we can track robustness across formats. | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import re | |
| from dataclasses import dataclass | |
| from pathlib import Path | |
| from typing import Any, Dict, List, Optional | |
| from app.services.cv_extractor import CVExtractionService | |
@dataclass
class BenchmarkSample:
    """One benchmark case: a CV text plus the hit counts expected from it.

    The ``@dataclass`` decorator supplies the keyword ``__init__`` used when
    samples are built as ``BenchmarkSample(name=..., text=..., expected=...)``;
    a plain class with bare annotations would reject those arguments.
    """

    # Human-readable label printed in the benchmark report.
    name: str
    # Raw CV text handed to the extraction service.
    text: str
    # Expected number of hits per section, keyed by section name
    # ("identity", "experience", "education", "skills", "enrichment").
    expected: Dict[str, int]
def _load_reference_text() -> str:
    """Best-effort loader for the existing test CV fixture.

    Reads ``test_cv.txt`` located next to this file and decodes it with the
    first candidate encoding that yields plausible text.

    Returns:
        The decoded fixture text, or ``""`` when the file does not exist or
        no candidate encoding produces plausible text.
    """
    fixture_path = Path(__file__).with_name("test_cv.txt")
    try:
        raw_bytes = fixture_path.read_bytes()
    except FileNotFoundError:
        # The fixture is optional; its absence is not an error.
        return ""

    # UTF-8 must be tried before UTF-16: almost any even-length ASCII/UTF-8
    # payload also "decodes" as UTF-16 (into CJK-looking mojibake that passes
    # the plausibility check below), whereas real UTF-16 text is either invalid
    # UTF-8 (BOM bytes) or full of NULs and therefore rejected here.
    # "utf-8-sig" behaves like UTF-8 but also drops a leading BOM.
    for encoding in ("utf-8-sig", "utf-16", "utf-16-le", "latin-1"):
        try:
            text = raw_bytes.decode(encoding)
        except UnicodeError:
            continue

        cleaned = text.strip()
        alpha_count = sum(1 for char in cleaned if char.isalpha())
        # Plausibility heuristic: enough characters, enough letters, and no
        # NULs (a NUL signals that the wrong code-unit width was used).
        if len(cleaned) >= 20 and alpha_count >= 10 and "\x00" not in cleaned:
            return text

    return ""
def _build_samples() -> List[BenchmarkSample]:
    """Return the built-in benchmark cases, one per resume layout.

    Each case pairs a CV text (with surrounding whitespace stripped) with the
    per-section hit counts the extractor is expected to reach for it.
    """
    structured_english_cv = """
JOHN SMITH
john.smith@example.com | +33 6 12 34 56 78 | linkedin.com/in/johnsmith
PROFESSIONAL SUMMARY
Senior Full Stack Developer with 8 years of experience in web development.
EXPERIENCE
Senior Developer - Tech Company Inc (2020-2024)
- Led team of 5 developers
- Built microservices using Python and FastAPI
- Managed PostgreSQL databases
Junior Developer - Startup LLC (2016-2020)
- Developed React frontend applications
- Worked with Node.js backend
EDUCATION
Bachelor of Science in Computer Science
University of Technology (2016)
SKILLS
Python, JavaScript, TypeScript, SQL, HTML/CSS, FastAPI, React, Docker
"""

    french_cv_with_links = """
MARIE DUPONT
marie.dupont@gmail.com | 06 12 34 56 78 | Paris
linkedin.com/in/mariedupont | github.com/mdupont | mariedupont.dev
PROFIL
Chef de projet digital orientée produit et expérience client.
EXPÉRIENCES PROFESSIONNELLES
Responsable Marketing Digital - Entreprise X (2021 - Présent)
- Pilotage de campagnes multi-canaux
- Analyse des performances et reporting
Chef de projet CRM - Société Y (2018 - 2021)
- Mise en place d'automatisations marketing
- Coordination avec les équipes produit et design
FORMATION
Master Marketing Digital - Université de Lyon (2018)
COMPÉTENCES
Communication, Organisation, Gestion de projet, Leadership, Sens du contact
CERTIFICATIONS
Google Analytics Individual Qualification
HubSpot Inbound Marketing
PROJETS
Refonte du parcours d'onboarding client
"""

    ocr_noisy_cv = """
ALEXANDRE MARTIN
alex.martin@example.com
EXPERIENCE PROFESSIONNELLE
2022 - PRESENT | DATA ENGINEER | BLUE ANALYTICS
Built ETL pipelines on Airflow and Spark
Implemented data quality checks and dashboards
2020 - 2022 - BI ANALYST - RETAIL GROUP
Automated SQL reporting and Power BI models
FORMATION
2019 - Master Data Science - Paris School of AI
LANGUES
French English
COMPETENCES
Python, SQL, Airflow, Spark, Power BI, Communication
PROJECTS
Customer churn prediction using Python and scikit-learn
"""

    minimal_fallback_cv = """
NOAH LEROY
noah.leroy@outlook.com
Some short CV text with little structure.
Python, SQL, Docker.
"""

    ultra_short_cv = """
NADIA BENALI
nadia.benali@example.com
+33 6 98 76 54 32
Paris, France
Python | SQL | Data analysis
"""

    # (label, raw CV text, expected hit count per section)
    catalogue = (
        (
            "Structured English CV",
            structured_english_cv,
            {"identity": 3, "experience": 2, "education": 2, "skills": 6, "enrichment": 1},
        ),
        (
            "French CV with links",
            french_cv_with_links,
            {"identity": 3, "experience": 2, "education": 1, "skills": 4, "enrichment": 4},
        ),
        (
            "OCR noisy CV",
            ocr_noisy_cv,
            {"identity": 2, "experience": 2, "education": 1, "skills": 4, "enrichment": 2},
        ),
        (
            "Minimal fallback CV",
            minimal_fallback_cv,
            {"identity": 2, "experience": 0, "education": 0, "skills": 2, "enrichment": 0},
        ),
        (
            "Ultra short CV",
            ultra_short_cv,
            {"identity": 3, "experience": 0, "education": 0, "skills": 2, "enrichment": 1},
        ),
    )

    return [
        BenchmarkSample(name=label, text=raw_text.strip(), expected=targets)
        for label, raw_text, targets in catalogue
    ]
def _safe_len(value: Any) -> int:
    """Count the items carried by an extracted field.

    Returns the length for a ``list``; any other value counts as a single
    item when truthy and as zero when falsy (``None``, ``""``, ``0``, ...).
    """
    if not isinstance(value, list):
        return 1 if value else 0
    return len(value)
def _score_section(found: int, expected: int) -> float:
    """Convert a hit count into a 0-100 coverage score for one section.

    With a positive ``expected``, the score is the ratio ``found / expected``
    capped at 1 and scaled to 100. With ``expected <= 0``, finding nothing
    scores 100 and otherwise each hit is worth 25 points, capped at 100.
    """
    if expected > 0:
        coverage = found / expected
        return min(coverage, 1.0) * 100.0

    # NOTE(review): this branch is non-monotonic (0 hits -> 100, 1 hit -> 25,
    # 4 hits -> 100). Confirm whether unexpected hits were meant to be
    # penalised instead; behaviour is kept as-is here.
    if found == 0:
        return 100.0
    return min(found * 25.0, 100.0)
def _build_section_scores(structured: Dict[str, Any]) -> Dict[str, float]:
    """Count extracted hits per benchmark section.

    Despite the name, the returned values are raw hit counts (as floats), not
    0-100 scores: identity counts which of name/email/phone are present, the
    other sections count the items found under their keys in ``structured``.
    """
    identity_keys = ("full_name", "email", "phone")
    enrichment_keys = (
        "linkedin_urls",
        "github_urls",
        "portfolio_urls",
        "certifications",
        "projects",
        "languages",
        "soft_skills",
        "interests",
    )

    # Identity fields are scalar: each one is either present (1) or absent (0).
    identity_hits = len([key for key in identity_keys if structured.get(key)])

    # Enrichment pools every optional field into a single counter.
    enrichment_hits = 0
    for key in enrichment_keys:
        enrichment_hits += _safe_len(structured.get(key))

    experience_hits = _safe_len(structured.get("experiences"))
    education_hits = _safe_len(structured.get("education"))
    skill_hits = _safe_len(structured.get("skills"))

    return {
        "identity": float(identity_hits),
        "experience": float(experience_hits),
        "education": float(education_hits),
        "skills": float(skill_hits),
        "enrichment": float(enrichment_hits),
    }
def _overall_score(section_hits: Dict[str, float], expected: Dict[str, int]) -> float:
    """Combine per-section scores into one weighted score, rounded to 0.1.

    Each section's hit count is scored against its expected count (0 when the
    section is absent from ``expected``) and weighted; the weights sum to 1.
    """
    weighted_sections = (
        ("identity", 0.25),
        ("experience", 0.30),
        ("education", 0.15),
        ("skills", 0.15),
        ("enrichment", 0.15),
    )

    accumulated = 0.0
    for section_name, section_weight in weighted_sections:
        hits = int(section_hits[section_name])
        target = expected.get(section_name, 0)
        accumulated += section_weight * _score_section(hits, target)

    return round(accumulated, 1)
def _diagnose_missing_fields(sample: "BenchmarkSample", structured: Dict[str, Any]) -> List[str]:
    """Return human-readable reasons for missing fields in a given sample.

    Args:
        sample: The benchmark case; only its ``text`` attribute is read.
        structured: The extractor's structured output for that text.

    Returns:
        One (French) message per missing field, in the order name, phone,
        experience, education, skills, enrichment. Empty when nothing is
        missing.
    """
    import unicodedata

    reasons: List[str] = []
    lines = [line.strip() for line in sample.text.splitlines() if line.strip()]

    # Keyword checks run on lowercased, accent-stripped text so that French
    # headings such as "EXPÉRIENCE" or "ÉDUCATION" match the unaccented
    # keywords below ("experience", "education", ...).
    decomposed = unicodedata.normalize("NFD", sample.text.lower())
    normalized_text = "".join(char for char in decomposed if not unicodedata.combining(char))

    if not structured.get("full_name"):
        # A "name-like" line: 2 to 4 word tokens and no '@' or digit, within
        # the first five non-empty lines.
        top_lines = lines[:5]
        has_name_like_line = any(
            2 <= len(re.findall(r"[A-Za-zÀ-ÿ'-]+", line)) <= 4 and not re.search(r"[@\d]", line)
            for line in top_lines
        )
        if has_name_like_line:
            reasons.append("Nom probable présent en haut du CV mais rejeté par les filtres de nom.")
        elif structured.get("email"):
            reasons.append("Nom absent mais un email est disponible: vérifier l'inférence depuis l'email.")
        else:
            reasons.append("Aucune ligne de nom claire détectée dans les premières lignes.")

    if not structured.get("phone"):
        has_phone_like_text = bool(re.search(r"\+?\d[\d\s().-]{7,}\d", sample.text))
        if has_phone_like_text:
            reasons.append("Un numéro semble présent mais n'a pas passé la normalisation téléphone.")
        else:
            reasons.append("Aucun motif téléphone suffisamment clair détecté.")

    if not structured.get("experiences"):
        experience_keywords = ("experience", "experiences", "professionnelle", "work experience", "stage")
        if any(keyword in normalized_text for keyword in experience_keywords):
            reasons.append("Section expérience détectée mais aucun bloc stable titre/entreprise/période n'a pu être construit.")
        else:
            reasons.append("Aucune section expérience ou ancre de période détectée.")

    education_keywords = ("formation", "education", "study", "universit", "school")
    if not structured.get("education") and any(keyword in normalized_text for keyword in education_keywords):
        reasons.append("Section formation présente mais les lignes ne ressemblaient pas assez à de l'éducation.")

    if not structured.get("skills"):
        technical_tokens = ("python", "sql", "java", "react", "docker", "airflow", "spark")
        soft_skill_keywords = (
            "communication",
            "organisation",
            "leadership",
            "rigueur",
            "autonomie",
            "gestion de projet",
            "sens du contact",
        )
        if any(token in normalized_text for token in technical_tokens):
            reasons.append("Des mots-clés techniques existent mais la normalisation a raté l'extraction de compétences.")
        elif any(keyword in normalized_text for keyword in soft_skill_keywords):
            reasons.append("Le CV contient surtout des compétences génériques/soft skills; vérifier si elles doivent être reportées dans skills ou seulement dans soft_skills.")
        else:
            reasons.append("Aucune compétence technique évidente détectée.")

    # Only links, projects and certifications count as enrichment signals
    # here, matching the wording of the message.
    has_enrichment = (
        structured.get("linkedin_urls")
        or structured.get("github_urls")
        or structured.get("portfolio_urls")
        or structured.get("projects")
        or structured.get("certifications")
    )
    if not has_enrichment:
        reasons.append("Aucun signal d'enrichissement (liens/projets/certifications) détecté.")

    return reasons
def run_benchmark(diagnostic: bool = False) -> int:
    """Run every built-in sample through the extractor and print a report.

    Args:
        diagnostic: When true, also print the reasons returned by
            ``_diagnose_missing_fields`` for each sample.

    Returns:
        Process exit code: 0 when the average overall score is at least
        70.0, otherwise 1.
    """
    service = CVExtractionService()
    samples = _build_samples()
    section_names = ("identity", "experience", "education", "skills", "enrichment")
    link_keys = ("linkedin_urls", "github_urls", "portfolio_urls")

    print("=" * 78)
    print("Multi-CV Extraction Benchmark")
    print("=" * 78)

    aggregate: List[float] = []
    for index, sample in enumerate(samples, start=1):
        result = service.extract_from_text(sample.text)
        structured = result.structured
        section_hits = _build_section_scores(structured)
        overall = _overall_score(section_hits, sample.expected)
        aggregate.append(overall)

        # A missing expected section defaults to 0, as in _overall_score,
        # instead of raising KeyError.
        section_report = ", ".join(
            f"{section}={_score_section(int(section_hits[section]), sample.expected.get(section, 0)):.1f}"
            for section in section_names
        )
        # Counts go through _safe_len, like the scoring path, so a field that
        # is present but None is reported as 0 instead of crashing len().
        link_count = sum(_safe_len(structured.get(key)) for key in link_keys)

        print(f"\n[{index}] {sample.name}")
        print(f" Overall: {overall:.1f}/100")
        print(" Sections: " + section_report)
        print(
            " Extracted: "
            f"name={bool(structured.get('full_name'))}, "
            f"email={bool(structured.get('email'))}, "
            f"phone={bool(structured.get('phone'))}, "
            f"experiences={_safe_len(structured.get('experiences'))}, "
            f"education={_safe_len(structured.get('education'))}, "
            f"skills={_safe_len(structured.get('skills'))}, "
            f"links={link_count}, "
            f"projects={_safe_len(structured.get('projects'))}, "
            f"certifications={_safe_len(structured.get('certifications'))}"
        )

        if diagnostic:
            reasons = _diagnose_missing_fields(sample, structured)
            if reasons:
                print(" Diagnostics:")
                for reason in reasons:
                    print(f" - {reason}")

    average_score = round(sum(aggregate) / len(aggregate), 1) if aggregate else 0.0
    print("\n" + "=" * 78)
    print(f"Average overall score: {average_score:.1f}/100")
    print("=" * 78)

    # A non-zero exit code lets callers (e.g. CI) fail on a low average.
    return 0 if average_score >= 70.0 else 1
if __name__ == "__main__":
    # Command-line entry point: parse the single optional flag, run the
    # benchmark, and exit with the code it returns.
    cli = argparse.ArgumentParser(description="Run the CV extraction benchmark")
    cli.add_argument(
        "--diagnostic",
        action="store_true",
        help="Print human-readable reasons for missing fields",
    )
    options = cli.parse_args()
    exit_code = run_benchmark(diagnostic=options.diagnostic)
    raise SystemExit(exit_code)