| """Benchmark baseline vs Siamese on CV-job pairs and propose decision thresholds. |
| |
| Usage: |
| source ../.venv-phase2/bin/activate |
| python scripts/benchmark_models.py \ |
| --data ../data/training_pairs.csv \ |
| --baseline ../models/baseline_model.joblib \ |
| --siamese ../models/siamese_model_phase2_full \ |
| --out-json ../reports/model_comparison.json |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import sys |
| from pathlib import Path |
| from typing import Any |
|
|
| import joblib |
| import numpy as np |
| import pandas as pd |
| from sklearn.metrics import accuracy_score, f1_score, precision_recall_curve, precision_score, recall_score, roc_auc_score |
| from sklearn.model_selection import train_test_split |
|
|
# Resolve the script location and the repository root two levels above it,
# then make sure "<repo_root>/backend" is importable so that the
# `app.services.*` imports further down can be resolved.
script_dir = Path(__file__).resolve().parent
repo_root = script_dir.parent.parent
_backend_dir = str(repo_root / "backend")
if _backend_dir not in sys.path:
    # Prepend so the backend package wins over any same-named installed package.
    sys.path.insert(0, _backend_dir)
|
|
| from app.services.feature_engineering import PairFeatureMeta, build_pair_features |
| from app.services.normalization import normalize_text |
|
|
|
|
def _load_baseline(path: Path) -> tuple[Any, dict[str, Any]]:
    """Load the persisted baseline bundle and split it into model and metadata.

    Args:
        path: Location of the joblib artifact. The artifact must be a mapping
            with a ``"model"`` entry and, optionally, a ``"meta"`` entry.

    Returns:
        A ``(model, meta)`` pair. ``meta`` is an empty dict when the bundle
        has no ``"meta"`` entry or the stored value is falsy.

    Raises:
        KeyError: If the bundle has no ``"model"`` entry.
    """
    # NOTE: joblib artifacts are pickle-based; only load files from a trusted source.
    artifact = joblib.load(path)
    model = artifact["model"]
    meta = artifact.get("meta")
    if not meta:
        meta = {}
    return model, meta
|
|
|
|
def _baseline_scores(model: Any, meta: dict[str, Any], cv_texts: list[str], job_texts: list[str]) -> np.ndarray:
    """Score every (CV, job) pair with the baseline model.

    Args:
        model: Fitted estimator exposing at least one of ``predict_proba``,
            ``decision_function`` or ``predict``.
        meta: Either a ready ``PairFeatureMeta`` instance or a mapping holding
            the fitted transformers under the ``"tf"`` and ``"svd"`` keys.
        cv_texts: CV texts, aligned index-by-index with ``job_texts``.
        job_texts: Job texts, aligned index-by-index with ``cv_texts``.

    Returns:
        One score per pair, clipped to ``[0, 1]``.

    Raises:
        ValueError: If ``cv_texts`` and ``job_texts`` differ in length.
        KeyError: If ``meta`` is a mapping missing ``"tf"`` or ``"svd"``.
    """
    # zip() would silently truncate to the shorter list and misalign the
    # scores with the labels, so reject mismatched inputs up front.
    if len(cv_texts) != len(job_texts):
        raise ValueError(
            f"cv_texts and job_texts must have the same length "
            f"(got {len(cv_texts)} and {len(job_texts)})"
        )

    if isinstance(meta, PairFeatureMeta):
        feature_meta = meta
    else:
        # NOTE(review): the bundle key is "tf" while the field is `tfidf` —
        # confirm this matches what the training script persists.
        feature_meta = PairFeatureMeta(tfidf=meta["tf"], svd=meta["svd"])
    X = np.vstack([build_pair_features(c, j, feature_meta) for c, j in zip(cv_texts, job_texts)])

    # Pick the scoring method by what the model exposes rather than by catching
    # every exception: a blanket `except Exception` would hide genuine failures
    # (e.g. a feature-width mismatch) and silently degrade to hard 0/1
    # predictions, corrupting the ROC-AUC and the thresholds derived from it.
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(X)[:, 1]
    elif hasattr(model, "decision_function"):
        raw = model.decision_function(X)
        # Squash raw margins into (0, 1) with a logistic function.
        scores = 1.0 / (1.0 + np.exp(-raw))
    else:
        scores = model.predict(X).astype(float)
    return np.clip(scores, 0.0, 1.0)
|
|
|
|
def _siamese_scores(model_path: Path, cv_texts: list[str], job_texts: list[str]) -> np.ndarray:
    """Score (CV, job) pairs by cosine similarity of their sentence embeddings.

    Args:
        model_path: Directory or identifier passed to ``SentenceTransformer``.
        cv_texts: CV texts, aligned index-by-index with ``job_texts``.
        job_texts: Job texts, aligned index-by-index with ``cv_texts``.

    Returns:
        One similarity per pair, clipped to ``[0, 1]`` (negative cosine
        values become ``0``).
    """
    # Imported lazily so the heavy dependency is only loaded when this scorer runs.
    from sentence_transformers import SentenceTransformer

    encoder = SentenceTransformer(str(model_path))
    encode_options = {
        "convert_to_numpy": True,
        "normalize_embeddings": True,
        "show_progress_bar": False,
    }
    cv_vectors = encoder.encode(cv_texts, **encode_options)
    job_vectors = encoder.encode(job_texts, **encode_options)

    # Embeddings are requested normalised, so the row-wise dot product is the cosine.
    similarity = (cv_vectors * job_vectors).sum(axis=1)
    return np.clip(similarity, 0.0, 1.0)
|
|
|
|
def _metrics(y_true: np.ndarray, scores: np.ndarray, threshold: float = 0.5) -> dict[str, float]:
    """Compute classification metrics for ``scores`` binarised at ``threshold``.

    Args:
        y_true: Ground-truth binary labels.
        scores: Continuous scores, one per label.
        threshold: Scores greater than or equal to this value count as positive.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall``, ``f1``, ``roc_auc``
        (computed on the raw scores, not the binarised ones) and the
        ``threshold`` that was applied.
    """
    hard_labels = (scores >= threshold).astype(int)

    summary: dict[str, float] = {"accuracy": float(accuracy_score(y_true, hard_labels))}
    # These three share a signature; zero_division=0 reports 0 instead of
    # warning when the denominator is empty.
    for key, scorer in (("precision", precision_score), ("recall", recall_score), ("f1", f1_score)):
        summary[key] = float(scorer(y_true, hard_labels, zero_division=0))
    summary["roc_auc"] = float(roc_auc_score(y_true, scores))
    summary["threshold"] = float(threshold)
    return summary
|
|
|
|
def _best_f1_threshold(y_true: np.ndarray, scores: np.ndarray) -> float:
    """Return the threshold on a 0.00–1.00 grid (step 0.01) that maximises F1.

    Args:
        y_true: Ground-truth binary labels.
        scores: Continuous scores, one per label.

    Returns:
        The grid threshold with the highest F1; on ties the lowest such
        threshold is returned.
    """
    grid = np.linspace(0.0, 1.0, 101)
    f1_per_cut = [
        f1_score(y_true, (scores >= cut).astype(int), zero_division=0)
        for cut in grid
    ]
    # argmax returns the first maximum, i.e. the lowest threshold among ties.
    return float(grid[int(np.argmax(f1_per_cut))])
|
|
|
|
def _threshold_for_precision(
    y_true: np.ndarray,
    scores: np.ndarray,
    target_precision: float,
    fallback: float,
    min_threshold: float,
) -> float:
    """Return the lowest score cut-off whose precision reaches ``target_precision``.

    Args:
        y_true: Ground-truth binary labels.
        scores: Continuous scores, one per label.
        target_precision: Minimum precision the returned threshold must achieve
            when predicting positive for ``scores >= threshold``.
        fallback: Value returned when no threshold qualifies.
        min_threshold: Thresholds below this value are never considered.

    Returns:
        The smallest qualifying threshold, or ``fallback`` if there is none.
    """
    precision, _recall, thresholds = precision_recall_curve(y_true, scores)
    if len(thresholds) == 0:
        return fallback

    # precision has one more entry than thresholds: precision[i] is the
    # precision obtained with `scores >= thresholds[i]`, and the extra final
    # entry is a sentinel 1.0 that belongs to no threshold. Drop the sentinel
    # so both arrays line up; indexing precision[i + 1] would judge each
    # threshold by its successor's precision.
    precision_at_threshold = precision[:-1]
    qualifies = (precision_at_threshold >= target_precision) & (thresholds >= min_threshold)

    if not qualifies.any():
        return fallback
    # The lowest qualifying cut-off keeps recall as high as possible.
    return float(thresholds[qualifies].min())
|
|
|
|
def main() -> None:
    """Benchmark both models on a held-out split and write a JSON report.

    The script reads the pair dataset, normalises both text columns, makes a
    stratified train/test split and scores the test pairs with the baseline
    and the Siamese model. Each model is evaluated at threshold 0.5 and at its
    best-F1 threshold. The model with the higher ROC-AUC (baseline wins ties)
    is then used to derive "accept" and "review" thresholds, and the full
    result is written to ``--out-json`` and printed to stdout.

    Note that all thresholds are chosen on the same test split that the
    metrics are reported on.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--data", default=str(repo_root / "data" / "training_pairs.csv"))
    cli.add_argument("--baseline", default=str(repo_root / "models" / "baseline_model.joblib"))
    cli.add_argument("--siamese", default=str(repo_root / "models" / "siamese_model_phase2_full"))
    cli.add_argument("--test-size", type=float, default=0.15)
    cli.add_argument("--seed", type=int, default=42)
    cli.add_argument("--out-json", default=str(repo_root / "reports" / "model_comparison.json"))
    args = cli.parse_args()

    # Load the dataset and apply the same cleaning to both text columns.
    df = pd.read_csv(args.data)
    df["label"] = df["label"].astype(int)
    for column in ("cv_text", "job_text"):
        df[column] = df[column].fillna("").astype(str).map(normalize_text)

    # Only the test part is scored; the train part is kept for its row count.
    train_df, test_df = train_test_split(
        df,
        test_size=args.test_size,
        random_state=args.seed,
        stratify=df["label"],
    )

    y_test = test_df["label"].to_numpy()
    cv_texts = test_df["cv_text"].tolist()
    job_texts = test_df["job_text"].tolist()

    # Score the test pairs with each model (baseline first, then Siamese).
    baseline_model, baseline_meta = _load_baseline(Path(args.baseline))
    scores_by_model = {
        "baseline": _baseline_scores(baseline_model, baseline_meta, cv_texts, job_texts),
        "siamese": _siamese_scores(Path(args.siamese), cv_texts, job_texts),
    }

    default_metrics = {
        name: _metrics(y_test, model_scores, threshold=0.5)
        for name, model_scores in scores_by_model.items()
    }
    best_cutoffs = {
        name: _best_f1_threshold(y_test, model_scores)
        for name, model_scores in scores_by_model.items()
    }
    best_metrics = {
        name: _metrics(y_test, scores_by_model[name], threshold=cutoff)
        for name, cutoff in best_cutoffs.items()
    }

    # Recommend the model with the higher ROC-AUC; the baseline wins ties.
    if default_metrics["baseline"]["roc_auc"] >= default_metrics["siamese"]["roc_auc"]:
        selected_model = "baseline"
    else:
        selected_model = "siamese"
    selected_scores = scores_by_model[selected_model]

    accept_t = _threshold_for_precision(
        y_test,
        selected_scores,
        target_precision=0.90,
        fallback=0.80,
        min_threshold=0.50,
    )
    review_t = _threshold_for_precision(
        y_test,
        selected_scores,
        target_precision=0.70,
        fallback=0.50,
        min_threshold=0.30,
    )
    # Keep the review threshold at least 0.05 below the accept threshold.
    if accept_t > 0.05:
        review_t = float(min(review_t, accept_t - 0.05))
    else:
        review_t = 0.50

    # Thresholds are reported as percentages rounded to two decimals.
    accept_pct = round(accept_t * 100.0, 2)
    review_pct = round(review_t * 100.0, 2)

    result = {
        "dataset": {
            "path": str(Path(args.data).resolve()),
            "rows_total": int(len(df)),
            "rows_train": int(len(train_df)),
            "rows_test": int(len(test_df)),
            "seed": int(args.seed),
            "test_size": float(args.test_size),
        },
        "baseline": {
            "default_threshold_0_5": default_metrics["baseline"],
            "best_f1": best_metrics["baseline"],
        },
        "siamese": {
            "default_threshold_0_5": default_metrics["siamese"],
            "best_f1": best_metrics["siamese"],
            "model_path": str(Path(args.siamese).resolve()),
        },
        "production_recommendation": {
            "model": selected_model,
            "accept_threshold_score_pct": accept_pct,
            "review_threshold_score_pct": review_pct,
            "env": {
                "MATCH_ACCEPT_THRESHOLD": str(accept_pct),
                "MATCH_REVIEW_THRESHOLD": str(review_pct),
            },
        },
    }

    # Serialise once and reuse the text for both the file and the console.
    report_text = json.dumps(result, indent=2, ensure_ascii=False)
    out_path = Path(args.out_json)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(report_text, encoding="utf-8")

    print("=== Model comparison complete ===")
    print(report_text)
    print(f"Saved report to {out_path}")


if __name__ == "__main__":
    main()
|
|