#!/usr/bin/env python3
"""
Run a representative CV case suite for extraction quality checks.
Usage:
PYTHONPATH=. python backend/scripts/run_cv_case_suite.py
"""
from __future__ import annotations
import argparse
import json
from pathlib import Path
from typing import Dict, List
from app.services.cv_extractor import CVExtractionService
# Repository root: the usage line runs this script from backend/scripts/, so
# the root is the third ancestor of the resolved script path.
ROOT = Path(__file__).resolve().parents[2]

# Case file used when --cases is not given on the command line.
DEFAULT_CASES = ROOT.joinpath("backend", "tests", "fixtures", "cv_cases.json")
def _load_cases(path: Path) -> List[Dict[str, object]]:
    """Load the list of CV test cases from a JSON file.

    The document may be either an object holding the cases under a
    ``"cases"`` key, or a bare top-level array of cases.

    Args:
        path: Location of the UTF-8 encoded JSON file.

    Returns:
        A new list with the case entries; empty when the file declares none
        (missing ``"cases"`` key or ``"cases": null``).

    Raises:
        OSError: If the file cannot be read.
        ValueError: If the content is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError``) or the cases are
            not stored as a JSON array.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(payload, dict):
        # ``or []`` also covers an explicit ``"cases": null``, which would
        # otherwise fail in ``list(None)``.
        payload = payload.get("cases") or []
    if not isinstance(payload, list):
        raise ValueError(
            f"Expected a JSON array of cases in {path}, got {type(payload).__name__}"
        )
    # Copy so callers can mutate the result without touching parsed data.
    return list(payload)
def _to_lower_set(values: List[object]) -> set[str]:
    """Normalise labels into a set for case-insensitive comparison.

    Every entry is converted with ``str()``, stripped of surrounding
    whitespace and lower-cased.

    Args:
        values: Labels to normalise; ``None`` is treated as an empty list.

    Returns:
        The set of normalised, non-empty labels. ``None`` entries and entries
        that are blank after stripping are left out.
    """
    normalised: set[str] = set()
    for value in values or ():
        # A missing name must not turn into the literal label "none".
        if value is None:
            continue
        label = str(value).strip().lower()
        if label:
            normalised.add(label)
    return normalised
def _score_expected(found: List[str], expected: List[str]) -> float:
    """Return the fraction of expected labels that appear in ``found``.

    Both lists are normalised with ``_to_lower_set`` before comparison, so
    matching ignores case and surrounding whitespace.

    Args:
        found: Labels produced by the extractor.
        expected: Labels the case requires.

    Returns:
        A value between 0.0 and 1.0. When nothing is expected (empty input,
        or only blank entries) the score is 1.0.
    """
    if not expected:
        return 1.0

    present = _to_lower_set(found)
    wanted = _to_lower_set(expected)
    if not wanted:
        return 1.0

    hits = wanted & present
    return len(hits) / len(wanted)
def main() -> None:
    """Run every case of the suite and print skill/language coverage.

    Command-line options:
        --cases: path to the JSON cases file (defaults to ``DEFAULT_CASES``).
        --min-skill-match: a case whose skill coverage is below this fraction
            counts as a failure (default 0.5).
        --strict: exit with status 2 when at least one case failed.

    Cases that point to a missing PDF, or that provide neither a PDF nor
    inline text, are skipped and never count as failures.

    Raises:
        SystemExit: If the cases file is missing or cannot be loaded, or with
            code 2 in strict mode when any case is below the threshold.
    """
    parser = argparse.ArgumentParser(
        description="Run a representative CV case suite for extraction quality checks."
    )
    parser.add_argument(
        "--cases",
        default=str(DEFAULT_CASES),
        help="path to the JSON file describing the cases",
    )
    parser.add_argument(
        "--min-skill-match",
        type=float,
        default=0.5,
        help="minimum fraction of expected skills that must be found",
    )
    parser.add_argument(
        "--strict",
        action="store_true",
        help="exit with status 2 if any case is below the threshold",
    )
    args = parser.parse_args()

    cases_path = Path(args.cases)
    if not cases_path.exists():
        raise SystemExit(f"Cases file not found: {cases_path}")

    service = CVExtractionService()
    failures = 0
    skipped = 0
    try:
        cases = _load_cases(cases_path)
    except (OSError, ValueError) as exc:
        # Malformed JSON (a ValueError subclass) or an unreadable file gets
        # the same clean exit as a missing file instead of a traceback.
        raise SystemExit(f"Could not load cases from {cases_path}: {exc}") from exc

    print(f"Running {len(cases)} CV cases from {cases_path}")
    for case in cases:
        case_id = case.get("id", "unknown")
        print("\n---")
        print(f"Case: {case_id}")

        text = case.get("text")
        pdf_path = case.get("pdf_path")
        requires_ocr = bool(case.get("requires_ocr"))

        if pdf_path:
            # PDF paths in the cases file are resolved relative to ROOT.
            pdf_full = (ROOT / str(pdf_path)).resolve()
            if not pdf_full.exists():
                print(f"PDF missing: {pdf_full}")
                if requires_ocr:
                    print("Skipping OCR-required case (no PDF provided).")
                # NOTE(review): every case with a missing PDF is skipped so
                # the extractor is never handed a non-existent file — confirm
                # this matches the intended behaviour.
                skipped += 1
                continue
            result = service.extract_from_pdf(str(pdf_full))
        elif text:
            result = service.extract_from_text(str(text))
        else:
            print("No text or PDF provided for this case.")
            skipped += 1
            continue

        expected_skills = case.get("expected_skills", [])
        expected_languages = case.get("expected_languages", [])

        # Only dict entries carry a "name"; tolerate a missing skills list the
        # same way a non-dict ``structured`` payload is tolerated below.
        found_skills = [
            item.get("name")
            for item in (result.skills or [])
            if isinstance(item, dict)
        ]
        found_languages = []
        if isinstance(result.structured, dict):
            found_languages = result.structured.get("languages") or []

        skill_match = _score_expected(found_skills, expected_skills)
        lang_match = _score_expected(found_languages, expected_languages)

        print(f"Quality score: {result.quality_score:.1f}")
        print(f"Skills extracted: {len(found_skills)}")
        print(f"Expected skills coverage: {skill_match:.2f}")
        if expected_languages:
            print(f"Expected language coverage: {lang_match:.2f}")

        # Only skill coverage decides pass/fail; language coverage is
        # informational.
        if skill_match < args.min_skill_match:
            failures += 1
            print("WARN: skill coverage below threshold")

    print("\nDone.")
    print(
        f"Summary: {len(cases) - skipped} evaluated, {skipped} skipped, "
        f"{failures} below skill threshold"
    )
    if failures and args.strict:
        raise SystemExit(2)
# Run the suite only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()