ai-talent-finder-backend / scripts /benchmark_models.py
ilyass yani
Deploiement backend dans HF Spaces
9df97a2
Raw
History Blame
7.78 kB
"""Benchmark baseline vs Siamese on CV-job pairs and propose decision thresholds.
Usage:
source ../.venv-phase2/bin/activate
python scripts/benchmark_models.py \
--data ../data/training_pairs.csv \
--baseline ../models/baseline_model.joblib \
--siamese ../models/siamese_model_phase2_full \
--out-json ../reports/model_comparison.json
"""
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
from typing import Any
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_recall_curve, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split
# Anchor every default path on this file's location so the script behaves the
# same regardless of the current working directory.
script_dir = Path(__file__).resolve().parent
repo_root = script_dir.parent.parent

# Put `<repo_root>/backend` at the front of the import path (once) so the
# `app.*` imports that follow can be resolved.
if not any(entry == str(repo_root / "backend") for entry in sys.path):
    sys.path.insert(0, str(repo_root / "backend"))
from app.services.feature_engineering import PairFeatureMeta, build_pair_features # noqa: E402
from app.services.normalization import normalize_text # noqa: E402
def _load_baseline(path: Path) -> tuple[Any, dict[str, Any]]:
    """Load the persisted baseline bundle and split it into model and metadata.

    The file at ``path`` is read with ``joblib.load`` and is indexed like a
    mapping holding a ``"model"`` entry and an optional ``"meta"`` entry.

    Args:
        path: Location of the serialized baseline bundle.

    Returns:
        A ``(model, meta)`` pair. ``meta`` is an empty dict when the bundle
        has no ``"meta"`` entry or the stored value is falsy.
    """
    # NOTE(review): joblib.load unpickles arbitrary objects; only point this
    # at artifacts from a trusted source.
    payload = joblib.load(path)
    estimator = payload["model"]
    metadata = payload.get("meta")
    if not metadata:
        metadata = {}
    return estimator, metadata
def _baseline_scores(model: Any, meta: dict[str, Any], cv_texts: list[str], job_texts: list[str]) -> np.ndarray:
    """Score each (CV, job) pair with the baseline model.

    Args:
        model: Fitted estimator. ``predict_proba`` is tried first, then
            ``decision_function`` (squashed through a logistic sigmoid), and
            finally ``predict`` cast to float.
        meta: Either a ready ``PairFeatureMeta`` or a mapping with ``"tf"``
            and ``"svd"`` entries from which one is built.
        cv_texts: CV texts, paired positionally with ``job_texts``.
        job_texts: Job texts, paired positionally with ``cv_texts``.

    Returns:
        One score per pair, clipped to ``[0.0, 1.0]``.
    """
    feature_meta = (
        meta
        if isinstance(meta, PairFeatureMeta)
        else PairFeatureMeta(tfidf=meta["tf"], svd=meta["svd"])
    )

    # One feature row per pair, stacked into a single 2-D matrix.
    rows = [
        build_pair_features(cv_text, job_text, feature_meta)
        for cv_text, job_text in zip(cv_texts, job_texts)
    ]
    X = np.vstack(rows)

    # Best-effort scoring: any failure at one level falls through to the next,
    # so estimators without probability support can still be benchmarked.
    try:
        positive = model.predict_proba(X)[:, 1]
    except Exception:
        try:
            margins = model.decision_function(X)
            positive = 1.0 / (1.0 + np.exp(-margins))
        except Exception:
            positive = model.predict(X).astype(float)

    return np.clip(positive, 0.0, 1.0)
def _siamese_scores(model_path: Path, cv_texts: list[str], job_texts: list[str]) -> np.ndarray:
    """Score each (CV, job) pair with the Siamese sentence-embedding model.

    Both text lists are encoded with the model stored at ``model_path`` using
    normalized embeddings; the score of a pair is the row-wise dot product of
    its two embeddings.

    Args:
        model_path: Directory or identifier passed to ``SentenceTransformer``.
        cv_texts: CV texts, paired positionally with ``job_texts``.
        job_texts: Job texts, paired positionally with ``cv_texts``.

    Returns:
        One score per pair, clipped to ``[0.0, 1.0]`` (negative similarities
        become ``0.0``).
    """
    # Imported inside the function, so the dependency is only loaded when the
    # Siamese model is actually scored.
    from sentence_transformers import SentenceTransformer

    encoder = SentenceTransformer(str(model_path))
    encode_options = {
        "convert_to_numpy": True,
        "normalize_embeddings": True,
        "show_progress_bar": False,
    }
    cv_vectors = encoder.encode(cv_texts, **encode_options)
    job_vectors = encoder.encode(job_texts, **encode_options)

    similarity = (cv_vectors * job_vectors).sum(axis=1)
    return np.clip(similarity, 0.0, 1.0)
def _metrics(y_true: np.ndarray, scores: np.ndarray, threshold: float = 0.5) -> dict[str, float]:
    """Compute classification metrics for ``scores`` cut at ``threshold``.

    Args:
        y_true: Ground-truth binary labels.
        scores: Continuous scores, one per label.
        threshold: Scores greater than or equal to this value are predicted
            as the positive class.

    Returns:
        A dict with ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``roc_auc`` and the ``threshold`` used, all as plain floats.
        ``roc_auc`` is computed from the raw scores and therefore does not
        depend on ``threshold``.
    """
    predicted = (scores >= threshold).astype(int)

    report: dict[str, float] = {"accuracy": float(accuracy_score(y_true, predicted))}
    # These three share the same call shape; undefined ratios are reported as 0.
    for name, scorer in (("precision", precision_score), ("recall", recall_score), ("f1", f1_score)):
        report[name] = float(scorer(y_true, predicted, zero_division=0))
    report["roc_auc"] = float(roc_auc_score(y_true, scores))
    report["threshold"] = float(threshold)
    return report
def _best_f1_threshold(y_true: np.ndarray, scores: np.ndarray) -> float:
    """Return the threshold on a 0.00-1.00 grid (step 0.01) with the highest F1.

    Args:
        y_true: Ground-truth binary labels.
        scores: Continuous scores, one per label.

    Returns:
        The grid value whose ``scores >= value`` predictions maximise F1.
        When several values tie, the lowest one is returned.
    """
    grid = np.linspace(0.0, 1.0, 101)
    f1_per_cut = [
        f1_score(y_true, (scores >= cut).astype(int), zero_division=0)
        for cut in grid
    ]
    # argmax returns the first maximum, i.e. the lowest threshold among ties.
    return float(grid[int(np.argmax(f1_per_cut))])
def _threshold_for_precision(
    y_true: np.ndarray,
    scores: np.ndarray,
    target_precision: float,
    fallback: float,
    min_threshold: float,
) -> float:
    """Return the lowest threshold that reaches ``target_precision``.

    Args:
        y_true: Ground-truth binary labels.
        scores: Continuous scores, one per label.
        target_precision: Minimum precision a threshold must achieve.
        fallback: Value returned when no threshold qualifies.
        min_threshold: Thresholds below this value are never considered.

    Returns:
        The smallest threshold ``t >= min_threshold`` whose predictions
        ``scores >= t`` have precision of at least ``target_precision``, or
        ``fallback`` when there is no such threshold.
    """
    precision, _recall, thresholds = precision_recall_curve(y_true, scores)
    if len(thresholds) == 0:
        return fallback

    # precision has one more entry than thresholds: precision[i] belongs to
    # thresholds[i], and the trailing 1.0 is a sentinel with no threshold.
    # Dropping the sentinel aligns the two arrays; pairing thresholds[i] with
    # precision[i + 1] would judge each cut-off by its neighbour's precision.
    per_threshold_precision = precision[:-1]

    qualifies = (per_threshold_precision >= target_precision) & (thresholds >= min_threshold)
    if not qualifies.any():
        return fallback
    return float(thresholds[qualifies].min())
def main() -> None:
    """Benchmark both models on a held-out split and write a JSON report.

    Steps:
        1. Parse CLI options (data path, model paths, split size, seed,
           output path).
        2. Load the pair dataset, normalise both text columns and take a
           stratified train/test split.
        3. Score the test split with the baseline and the Siamese model.
        4. Compute metrics at threshold 0.5 and at each model's best-F1
           threshold.
        5. Pick the model with the higher ROC-AUC (baseline wins ties) and
           derive accept/review thresholds for it via
           ``_threshold_for_precision``.
        6. Write the report to ``--out-json`` and print it.

    Note that the thresholds are derived from the same test split the metrics
    are reported on.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--data", default=str(repo_root / "data" / "training_pairs.csv"))
    cli.add_argument("--baseline", default=str(repo_root / "models" / "baseline_model.joblib"))
    cli.add_argument("--siamese", default=str(repo_root / "models" / "siamese_model_phase2_full"))
    cli.add_argument("--test-size", type=float, default=0.15)
    cli.add_argument("--seed", type=int, default=42)
    cli.add_argument("--out-json", default=str(repo_root / "reports" / "model_comparison.json"))
    args = cli.parse_args()

    # --- Data -------------------------------------------------------------
    df = pd.read_csv(args.data)
    df["label"] = df["label"].astype(int)
    for text_column in ("cv_text", "job_text"):
        df[text_column] = df[text_column].fillna("").astype(str).map(normalize_text)

    train_df, test_df = train_test_split(
        df,
        test_size=args.test_size,
        random_state=args.seed,
        stratify=df["label"],
    )
    y_test = test_df["label"].to_numpy()
    cv_texts = test_df["cv_text"].tolist()
    job_texts = test_df["job_text"].tolist()

    # --- Scoring ----------------------------------------------------------
    baseline_model, baseline_meta = _load_baseline(Path(args.baseline))
    model_scores = {
        "baseline": _baseline_scores(baseline_model, baseline_meta, cv_texts, job_texts),
        "siamese": _siamese_scores(Path(args.siamese), cv_texts, job_texts),
    }

    # --- Metrics ----------------------------------------------------------
    at_default = {
        name: _metrics(y_test, scores, threshold=0.5)
        for name, scores in model_scores.items()
    }
    at_best_f1 = {
        name: _metrics(y_test, scores, threshold=_best_f1_threshold(y_test, scores))
        for name, scores in model_scores.items()
    }

    # Business calibration from the best default ROC-AUC model.
    if at_default["baseline"]["roc_auc"] >= at_default["siamese"]["roc_auc"]:
        selected_model = "baseline"
    else:
        selected_model = "siamese"
    selected_scores = model_scores[selected_model]

    accept_t = _threshold_for_precision(
        y_test,
        selected_scores,
        target_precision=0.90,
        fallback=0.80,
        min_threshold=0.50,
    )
    review_t = _threshold_for_precision(
        y_test,
        selected_scores,
        target_precision=0.70,
        fallback=0.50,
        min_threshold=0.30,
    )
    # Keep the review band at least 0.05 below the accept threshold.
    if accept_t > 0.05:
        review_t = float(min(review_t, accept_t - 0.05))
    else:
        review_t = 0.50

    accept_pct = round(accept_t * 100.0, 2)
    review_pct = round(review_t * 100.0, 2)

    # --- Report -----------------------------------------------------------
    result = {
        "dataset": {
            "path": str(Path(args.data).resolve()),
            "rows_total": int(len(df)),
            "rows_train": int(len(train_df)),
            "rows_test": int(len(test_df)),
            "seed": int(args.seed),
            "test_size": float(args.test_size),
        },
        "baseline": {
            "default_threshold_0_5": at_default["baseline"],
            "best_f1": at_best_f1["baseline"],
        },
        "siamese": {
            "default_threshold_0_5": at_default["siamese"],
            "best_f1": at_best_f1["siamese"],
            "model_path": str(Path(args.siamese).resolve()),
        },
        "production_recommendation": {
            "model": selected_model,
            "accept_threshold_score_pct": accept_pct,
            "review_threshold_score_pct": review_pct,
            "env": {
                "MATCH_ACCEPT_THRESHOLD": str(accept_pct),
                "MATCH_REVIEW_THRESHOLD": str(review_pct),
            },
        },
    }
    rendered = json.dumps(result, indent=2, ensure_ascii=False)

    out_path = Path(args.out_json)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(rendered, encoding="utf-8")

    print("=== Model comparison complete ===")
    print(rendered)
    print(f"Saved report to {out_path}")
if __name__ == "__main__":
main()