Spaces:

RHmaster
/

ai-talent-finder-backend

Running

ai-talent-finder-backend / scripts /benchmark_models.py

ilyass yani

Deploiement backend dans HF Spaces

9df97a2 9 days ago

7.78 kB

	"""Benchmark baseline vs Siamese on CV-job pairs and propose decision thresholds.

	Usage:
	source ../.venv-phase2/bin/activate
	python scripts/benchmark_models.py \
	--data ../data/training_pairs.csv \
	--baseline ../models/baseline_model.joblib \
	--siamese ../models/siamese_model_phase2_full \
	--out-json ../reports/model_comparison.json
	"""

	from __future__ import annotations

	import argparse
	import json
	import sys
	from pathlib import Path
	from typing import Any

	import joblib
	import numpy as np
	import pandas as pd
	from sklearn.metrics import accuracy_score, f1_score, precision_recall_curve, precision_score, recall_score, roc_auc_score
	from sklearn.model_selection import train_test_split

	# Directory containing this script, and the repository root two levels above it.
	script_dir = Path(__file__).resolve().parent
	repo_root = script_dir.parent.parent
	# Put <repo_root>/backend at the front of sys.path so the `app.*` imports
	# that follow can be resolved; the membership test avoids a duplicate entry.
	if str(repo_root / "backend") not in sys.path:
	sys.path.insert(0, str(repo_root / "backend"))

	from app.services.feature_engineering import PairFeatureMeta, build_pair_features # noqa: E402
	from app.services.normalization import normalize_text # noqa: E402


	def _load_baseline(path: Path) -> tuple[Any, dict[str, Any]]:
	    """Load the baseline bundle stored at *path* and split it into its parts.

	    The file is read with ``joblib.load`` and is expected to hold a mapping
	    with a ``"model"`` entry and an optional ``"meta"`` entry.

	    Args:
	        path: Location of the serialized bundle.

	    Returns:
	        A ``(model, meta)`` pair. When ``"meta"`` is absent or falsy an empty
	        dict is returned in its place.

	    Raises:
	        KeyError: If the bundle has no ``"model"`` entry.
	    """
	    # NOTE(review): joblib.load unpickles the file, so only trusted bundles
	    # should be passed here.
	    payload = joblib.load(path)
	    estimator = payload["model"]
	    stored_meta = payload.get("meta")
	    return estimator, (stored_meta if stored_meta else {})


	def _baseline_scores(model: Any, meta: dict[str, Any], cv_texts: list[str], job_texts: list[str]) -> np.ndarray:
	    """Score every (CV, job) pair with the baseline model.

	    Pair features are built with ``build_pair_features`` and stacked into one
	    matrix. The model is then queried with the first of these that succeeds:
	    ``predict_proba`` (second column), ``decision_function`` squashed through
	    a logistic sigmoid, or plain ``predict`` cast to float.

	    Args:
	        model: Fitted estimator taken from the baseline bundle.
	        meta: Either a ready ``PairFeatureMeta`` or a mapping with ``"tf"``
	            and ``"svd"`` entries used to construct one.
	        cv_texts: CV texts, paired position-by-position with ``job_texts``.
	        job_texts: Job texts.

	    Returns:
	        One score per pair, clipped to the range [0, 1].
	    """
	    if not isinstance(meta, PairFeatureMeta):
	        # NOTE(review): the mapping key is "tf" while the keyword is `tfidf`;
	        # confirm this matches what the training script stores in the bundle.
	        meta = PairFeatureMeta(tfidf=meta["tf"], svd=meta["svd"])

	    pair_rows = [build_pair_features(cv, job, meta) for cv, job in zip(cv_texts, job_texts)]
	    X = np.vstack(pair_rows)

	    def _raw_scores() -> np.ndarray:
	        # Best-effort chain: any failure moves on to the next, weaker, API.
	        try:
	            return model.predict_proba(X)[:, 1]
	        except Exception:
	            pass
	        try:
	            margin = model.decision_function(X)
	            return 1.0 / (1.0 + np.exp(-margin))
	        except Exception:
	            return model.predict(X).astype(float)

	    return np.clip(_raw_scores(), 0.0, 1.0)


	def _siamese_scores(model_path: Path, cv_texts: list[str], job_texts: list[str]) -> np.ndarray:
	    """Score every (CV, job) pair with the Siamese sentence-transformer model.

	    Both text lists are encoded with the model loaded from *model_path*.
	    Embeddings are requested already normalized, so the row-wise dot product
	    is used as the cosine similarity of each pair.

	    Args:
	        model_path: Directory or identifier handed to ``SentenceTransformer``.
	        cv_texts: CV texts, paired position-by-position with ``job_texts``.
	        job_texts: Job texts.

	    Returns:
	        One similarity per pair, clipped to [0, 1] (negative values become 0).
	    """
	    # Imported lazily so the dependency is only loaded when this function runs.
	    from sentence_transformers import SentenceTransformer

	    encoder = SentenceTransformer(str(model_path))
	    encode_options = {
	        "convert_to_numpy": True,
	        "normalize_embeddings": True,
	        "show_progress_bar": False,
	    }
	    cv_vectors = encoder.encode(cv_texts, **encode_options)
	    job_vectors = encoder.encode(job_texts, **encode_options)

	    similarity = (cv_vectors * job_vectors).sum(axis=1)
	    return np.clip(similarity, 0.0, 1.0)


	def _metrics(y_true: np.ndarray, scores: np.ndarray, threshold: float = 0.5) -> dict[str, float]:
	    """Compute classification metrics for *scores* binarized at *threshold*.

	    Args:
	        y_true: Ground-truth binary labels.
	        scores: Model scores; a pair is predicted positive when its score is
	            greater than or equal to *threshold*.
	        threshold: Decision cut-off applied to *scores*.

	    Returns:
	        A dict with ``accuracy``, ``precision``, ``recall``, ``f1``,
	        ``roc_auc`` (computed on the raw scores, not the binarized
	        predictions) and the ``threshold`` that was used. Precision, recall
	        and F1 are reported as 0 when they would otherwise be undefined.
	    """
	    predicted = (scores >= threshold).astype(int)

	    report: dict[str, float] = {"accuracy": float(accuracy_score(y_true, predicted))}
	    thresholded_scorers = (
	        ("precision", precision_score),
	        ("recall", recall_score),
	        ("f1", f1_score),
	    )
	    for label, scorer in thresholded_scorers:
	        report[label] = float(scorer(y_true, predicted, zero_division=0))

	    report["roc_auc"] = float(roc_auc_score(y_true, scores))
	    report["threshold"] = float(threshold)
	    return report


	def _best_f1_threshold(y_true: np.ndarray, scores: np.ndarray) -> float:
	    """Return the cut-off in [0, 1] that maximizes F1 on the given data.

	    The search covers 101 evenly spaced thresholds from 0.0 to 1.0. When
	    several thresholds tie for the best F1, the lowest one is returned.

	    Args:
	        y_true: Ground-truth binary labels.
	        scores: Model scores; positives are ``scores >= threshold``.

	    Returns:
	        The selected threshold as a plain float.
	    """
	    grid = np.linspace(0.0, 1.0, 101)
	    f1_per_cut = [
	        f1_score(y_true, (scores >= cut).astype(int), zero_division=0)
	        for cut in grid
	    ]
	    # argmax reports the first maximum, i.e. the lowest tied threshold.
	    return float(grid[int(np.argmax(f1_per_cut))])


	def _threshold_for_precision(
	    y_true: np.ndarray,
	    scores: np.ndarray,
	    target_precision: float,
	    fallback: float,
	    min_threshold: float,
	) -> float:
	    """Return the lowest score threshold whose precision reaches a target.

	    Candidate thresholds come from ``precision_recall_curve``. A threshold
	    qualifies when the precision of the predictions ``score >= threshold`` is
	    at least *target_precision* and the threshold itself is at least
	    *min_threshold*.

	    Args:
	        y_true: Ground-truth binary labels.
	        scores: Model scores for the same samples.
	        target_precision: Minimum precision the threshold must achieve.
	        fallback: Value returned when no threshold qualifies.
	        min_threshold: Lower bound for an acceptable threshold.

	    Returns:
	        The smallest qualifying threshold, or *fallback* if there is none.
	    """
	    precision, _recall, thresholds = precision_recall_curve(y_true, scores)
	    if len(thresholds) == 0:
	        return fallback

	    # `precision` has one more entry than `thresholds`: precision[i] is the
	    # precision for `score >= thresholds[i]`, and the trailing entry is a
	    # constant 1.0 that belongs to no threshold. Pairing thresholds[i] with
	    # precision[i + 1] would read the precision of the *next* cut-off and let
	    # the highest threshold always pass through that sentinel.
	    thresholds = np.asarray(thresholds, dtype=float)
	    precision_at_threshold = np.asarray(precision)[: len(thresholds)]

	    qualifies = (precision_at_threshold >= target_precision) & (thresholds >= min_threshold)
	    eligible = thresholds[qualifies]
	    if eligible.size == 0:
	        return fallback
	    return float(eligible.min())


	def main() -> None:
	    """Run the baseline-vs-Siamese benchmark and write the JSON report.

	    Steps:
	      1. Parse CLI options (data path, model paths, test size, seed, output).
	      2. Load the pair dataset, normalize both text columns and take a
	         stratified train/test split; only the test part is scored.
	      3. Score the test pairs with both models and compute metrics at the
	         0.5 threshold and at each model's best-F1 threshold.
	      4. Pick the model with the higher ROC-AUC (baseline wins ties) and
	         derive "accept" and "review" thresholds from its scores.
	      5. Write the report to ``--out-json`` and echo it to stdout.
	    """
	    cli = argparse.ArgumentParser()
	    models_dir = repo_root / "models"
	    cli.add_argument("--data", default=str(repo_root / "data" / "training_pairs.csv"))
	    cli.add_argument("--baseline", default=str(models_dir / "baseline_model.joblib"))
	    cli.add_argument("--siamese", default=str(models_dir / "siamese_model_phase2_full"))
	    cli.add_argument("--test-size", type=float, default=0.15)
	    cli.add_argument("--seed", type=int, default=42)
	    cli.add_argument("--out-json", default=str(repo_root / "reports" / "model_comparison.json"))
	    opts = cli.parse_args()

	    frame = pd.read_csv(opts.data)
	    frame["label"] = frame["label"].astype(int)
	    for text_column in ("cv_text", "job_text"):
	        frame[text_column] = frame[text_column].fillna("").astype(str).map(normalize_text)

	    train_part, test_part = train_test_split(
	        frame,
	        test_size=opts.test_size,
	        random_state=opts.seed,
	        stratify=frame["label"],
	    )

	    labels = test_part["label"].to_numpy()
	    cvs = test_part["cv_text"].tolist()
	    jobs = test_part["job_text"].tolist()

	    # Baseline is scored before the Siamese model is loaded.
	    baseline_model, baseline_meta = _load_baseline(Path(opts.baseline))
	    scores_by_model = {
	        "baseline": _baseline_scores(baseline_model, baseline_meta, cvs, jobs),
	        "siamese": _siamese_scores(Path(opts.siamese), cvs, jobs),
	    }

	    default_metrics = {
	        name: _metrics(labels, model_scores, threshold=0.5)
	        for name, model_scores in scores_by_model.items()
	    }
	    best_f1_metrics = {
	        name: _metrics(labels, model_scores, threshold=_best_f1_threshold(labels, model_scores))
	        for name, model_scores in scores_by_model.items()
	    }

	    # Thresholds for production are calibrated on whichever model has the
	    # higher ROC-AUC; the baseline is kept when the two are equal.
	    if default_metrics["baseline"]["roc_auc"] >= default_metrics["siamese"]["roc_auc"]:
	        selected_model = "baseline"
	    else:
	        selected_model = "siamese"
	    selected_scores = scores_by_model[selected_model]

	    accept_t = _threshold_for_precision(
	        labels,
	        selected_scores,
	        target_precision=0.90,
	        fallback=0.80,
	        min_threshold=0.50,
	    )
	    review_t = _threshold_for_precision(
	        labels,
	        selected_scores,
	        target_precision=0.70,
	        fallback=0.50,
	        min_threshold=0.30,
	    )
	    # Keep the review threshold at least 0.05 below the accept threshold.
	    if accept_t > 0.05:
	        review_t = float(min(review_t, accept_t - 0.05))
	    else:
	        review_t = 0.50

	    accept_pct = round(accept_t * 100.0, 2)
	    review_pct = round(review_t * 100.0, 2)

	    result = {
	        "dataset": {
	            "path": str(Path(opts.data).resolve()),
	            "rows_total": int(len(frame)),
	            "rows_train": int(len(train_part)),
	            "rows_test": int(len(test_part)),
	            "seed": int(opts.seed),
	            "test_size": float(opts.test_size),
	        },
	        "baseline": {
	            "default_threshold_0_5": default_metrics["baseline"],
	            "best_f1": best_f1_metrics["baseline"],
	        },
	        "siamese": {
	            "default_threshold_0_5": default_metrics["siamese"],
	            "best_f1": best_f1_metrics["siamese"],
	            "model_path": str(Path(opts.siamese).resolve()),
	        },
	        "production_recommendation": {
	            "model": selected_model,
	            "accept_threshold_score_pct": accept_pct,
	            "review_threshold_score_pct": review_pct,
	            "env": {
	                "MATCH_ACCEPT_THRESHOLD": str(accept_pct),
	                "MATCH_REVIEW_THRESHOLD": str(review_pct),
	            },
	        },
	    }
	    report_text = json.dumps(result, indent=2, ensure_ascii=False)

	    out_path = Path(opts.out_json)
	    out_path.parent.mkdir(parents=True, exist_ok=True)
	    out_path.write_text(report_text, encoding="utf-8")

	    print("=== Model comparison complete ===")
	    print(report_text)
	    print(f"Saved report to {out_path}")


	if __name__ == "__main__":
	main()