| |
| """ |
| Run a representative CV case suite for extraction quality checks. |
| Usage: |
| PYTHONPATH=. python backend/scripts/run_cv_case_suite.py |
| """ |
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| from pathlib import Path |
| from typing import Dict, List |
|
|
| from app.services.cv_extractor import CVExtractionService |
|
|
# Third ancestor of this file. With the script located at backend/scripts/
# (see the usage line in the module docstring) this is the repository root.
ROOT = Path(__file__).resolve().parents[2]
# Case fixture used when --cases is not given on the command line.
DEFAULT_CASES = ROOT.joinpath("backend", "tests", "fixtures", "cv_cases.json")
|
|
|
|
def _load_cases(path: Path) -> List[Dict[str, object]]:
    """Load the CV cases stored in the JSON file at *path*.

    Two layouts are accepted: an object holding a ``"cases"`` array, or a
    bare top-level array of case objects.

    Args:
        path: Location of the UTF-8 encoded JSON case file.

    Returns:
        A new list of cases. Empty when ``"cases"`` is missing, ``null`` or
        empty.

    Raises:
        ValueError: If the top-level JSON value is neither an object nor an
            array.
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(payload, list):
        return list(payload)
    if not isinstance(payload, dict):
        raise ValueError(
            f"Unsupported cases payload in {path}: expected a JSON object or "
            f"array, got {type(payload).__name__}"
        )
    # ``or []`` also covers an explicit ``"cases": null``, which list() rejects.
    return list(payload.get("cases") or [])
|
|
|
|
def _to_lower_set(values: List[str]) -> set[str]:
    """Normalise *values* into a set of stripped, lower-cased strings.

    Args:
        values: Items to normalise. Non-string items are converted with
            ``str()``; ``None`` items are ignored.

    Returns:
        The distinct normalised strings. Entries that are ``None`` or blank
        after stripping are left out.
    """
    normalised: set[str] = set()
    for value in values or ():
        if value is None:
            # str(None) would otherwise add the bogus entry "none".
            continue
        text = str(value).strip().lower()
        if text:
            normalised.add(text)
    return normalised
|
|
|
|
def _score_expected(found: List[str], expected: List[str]) -> float:
    """Return the fraction of *expected* entries that appear in *found*.

    Both lists are compared after normalisation by ``_to_lower_set``. When
    there is nothing to expect (empty list, or only blank entries) the score
    is ``1.0``.
    """
    if expected:
        have = _to_lower_set(found)
        wanted = _to_lower_set(expected)
        if wanted:
            # Share of the distinct expected entries that were found.
            return len(wanted & have) / len(wanted)
    return 1.0
|
|
|
|
def main() -> None:
    """Run every case in the suite and print a per-case extraction report.

    Command-line options:
        --cases            Path to the JSON case file (default: DEFAULT_CASES).
        --min-skill-match  Minimum expected-skill coverage for a case to pass.
        --strict           Exit with status 2 when any case is below the
                           threshold.

    Cases without usable input (a PDF path that does not exist, or neither
    text nor PDF) are reported and skipped; they never count as failures.

    Raises:
        SystemExit: If the case file does not exist, or with code 2 when
            ``--strict`` is set and at least one case is below the threshold.
    """
    parser = argparse.ArgumentParser(
        description="Run a representative CV case suite for extraction quality checks."
    )
    parser.add_argument(
        "--cases",
        default=str(DEFAULT_CASES),
        help="path to the JSON file describing the cases",
    )
    parser.add_argument(
        "--min-skill-match",
        type=float,
        default=0.5,
        help="minimum expected-skill coverage before a case is flagged",
    )
    parser.add_argument(
        "--strict",
        action="store_true",
        help="exit with status 2 if any case is flagged",
    )
    args = parser.parse_args()

    cases_path = Path(args.cases)
    if not cases_path.exists():
        raise SystemExit(f"Cases file not found: {cases_path}")

    service = CVExtractionService()
    failures = 0

    cases = _load_cases(cases_path)
    print(f"Running {len(cases)} CV cases from {cases_path}")
    for case in cases:
        case_id = case.get("id", "unknown")
        print("\n---")
        print(f"Case: {case_id}")

        text = case.get("text")
        pdf_path = case.get("pdf_path")
        requires_ocr = bool(case.get("requires_ocr"))

        # A PDF path takes precedence over inline text.
        if pdf_path:
            # Relative PDF paths are resolved against the repository root.
            pdf_full = (ROOT / str(pdf_path)).resolve()
            if not pdf_full.exists():
                print(f"PDF missing: {pdf_full}")
                if requires_ocr:
                    print("Skipping OCR-required case (no PDF provided).")
                # A missing PDF always skips the case; there is nothing to extract.
                continue
            result = service.extract_from_pdf(str(pdf_full))
        elif text:
            result = service.extract_from_text(str(text))
        else:
            print("No text or PDF provided for this case.")
            continue

        expected_skills = case.get("expected_skills", [])
        expected_languages = case.get("expected_languages", [])
        # Only dict entries with a non-empty "name" count as extracted skills;
        # nameless entries would inflate the count and put None into scoring.
        found_skills = [
            item["name"]
            for item in (result.skills or [])
            if isinstance(item, dict) and item.get("name")
        ]
        found_languages = []
        if isinstance(result.structured, dict):
            found_languages = result.structured.get("languages") or []

        skill_match = _score_expected(found_skills, expected_skills)
        lang_match = _score_expected(found_languages, expected_languages)

        print(f"Quality score: {result.quality_score:.1f}")
        print(f"Skills extracted: {len(found_skills)}")
        print(f"Expected skills coverage: {skill_match:.2f}")
        if expected_languages:
            print(f"Expected language coverage: {lang_match:.2f}")

        # Only skill coverage decides pass/fail; language coverage is informational.
        if skill_match < args.min_skill_match:
            failures += 1
            print("WARN: skill coverage below threshold")

    print("\nDone.")
    if failures and args.strict:
        raise SystemExit(2)
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|