# ai-talent-finder-backend / scripts / build_final_matching_artifacts.py
# ilyass yani
# Deploiement backend dans HF Spaces
# 9df97a2
# Raw
# History Blame
# 20 kB
"""Build the final matching artifacts for the project.
This script creates a reproducible pipeline that:
- exports a labeled dataset mixing real DB records and synthetic augmentation
- trains a supervised baseline model with train/test separation
- benchmarks it against the lightweight semantic matcher
- writes a final model bundle and a JSON report for the defense/demo
Usage (run from the repository root, with the project's virtualenv active):
    python backend/scripts/build_final_matching_artifacts.py --db backend/ai_talent_finder.db
"""
from __future__ import annotations
import argparse
import json
import random
import sqlite3
import sys
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any
import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split
# Locate the repository root (two levels above this script's directory) so the
# project-local ``app`` package under ``backend/`` can be imported when this
# file is executed directly as a script.
script_dir = Path(__file__).resolve().parent
repo_root = script_dir.parent.parent
if str(repo_root / "backend") not in sys.path:
    # Prepend so the ``app.*`` imports that follow resolve against this checkout.
    sys.path.insert(0, str(repo_root / "backend"))
from app.services.data_normalization import parse_experience_years
from app.services.feature_engineering import build_pair_features, fit_pair_vectorizer
from app.services.lightweight_siamese import get_siamese_matcher
from app.services.normalization import normalize_skill_name, normalize_text
from app.services.scoring import compute_match_score
from app.services.synthetic_data import SKILLS_POOL, generate_synthetic_candidate
@dataclass
class SplitMetrics:
    """Binary-classification metrics computed for one data split at one threshold."""

    accuracy: float
    precision: float
    recall: float
    f1: float
    # None when ROC AUC could not be computed for the split.
    roc_auc: float | None
    # Score cut-off used to turn continuous scores into 0/1 predictions.
    threshold: float
def _ensure_parent(path: Path) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
def _normalize_skill_list(skills: list[str] | None) -> list[str]:
return [normalize_skill_name(skill) for skill in (skills or []) if normalize_skill_name(skill)]
def _candidate_text(record: dict[str, Any]) -> str:
    """Flatten a candidate record into a single normalized text blob.

    The name and raw CV text come first, followed by the entries of the
    ``skills``, ``companies``, ``job_titles``, ``education`` and ``languages``
    lists. Empty parts are skipped and the rest are joined with ``" \\n "``
    before being passed through ``normalize_text``.

    Args:
        record: Candidate dict. Missing keys and keys stored as ``None`` are
            both treated as empty.

    Returns:
        The value returned by ``normalize_text`` for the joined parts.
    """
    parts: list[str] = [record.get("full_name") or "", record.get("raw_text") or ""]
    for key in ("skills", "companies", "job_titles", "education", "languages"):
        # ``or ()`` (rather than a ``.get`` default) also covers keys that are
        # present but explicitly None, which would make ``extend`` raise.
        parts.extend(record.get(key) or ())
    return normalize_text(" \n ".join(part for part in parts if part))
def _job_text(record: dict[str, Any]) -> str:
    """Flatten a job record into a single normalized text blob.

    The title and description come first, followed by the entries of
    ``required_skills`` and ``languages_required`` and finally the required
    years as a string. Empty parts are skipped (so a falsy ``required_years``
    contributes nothing) and the rest are joined with ``" \\n "`` before being
    passed through ``normalize_text``.

    Args:
        record: Job dict. Missing keys and keys stored as ``None`` are both
            treated as empty.

    Returns:
        The value returned by ``normalize_text`` for the joined parts.
    """
    parts: list[str] = [record.get("title") or "", record.get("description") or ""]
    for key in ("required_skills", "languages_required"):
        # ``or ()`` also covers keys that are present but explicitly None.
        parts.extend(record.get(key) or ())
    parts.append(str(record.get("required_years") or ""))
    return normalize_text(" \n ".join(part for part in parts if part))
def _load_real_data(db_path: Path) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
conn = sqlite3.connect(str(db_path))
conn.row_factory = sqlite3.Row
cur = conn.cursor()
candidates: list[dict[str, Any]] = []
for row in cur.execute(
"""
SELECT id, full_name, email, raw_text, extraction_quality_score, ner_extraction_data,
extracted_job_titles, extracted_companies, extracted_education, is_fully_extracted
FROM candidates
ORDER BY created_at DESC
"""
).fetchall():
payload: dict[str, Any] = {}
if row["ner_extraction_data"]:
try:
payload = json.loads(row["ner_extraction_data"])
except Exception:
payload = {}
skills = _normalize_skill_list(payload.get("skills") or [])
companies = payload.get("companies") or []
job_titles = payload.get("job_titles") or []
education = payload.get("education") or []
languages = payload.get("languages") or []
experience_years = parse_experience_years(row["raw_text"] or "")
candidates.append(
{
"id": int(row["id"]),
"source": "real_db",
"full_name": row["full_name"] or "",
"email": row["email"] or "",
"raw_text": row["raw_text"] or "",
"skills": skills,
"companies": companies,
"job_titles": job_titles,
"education": education,
"languages": languages,
"experience_years": experience_years,
"quality_score": float(row["extraction_quality_score"] or 0.0),
"is_fully_extracted": bool(row["is_fully_extracted"]),
}
)
jobs: list[dict[str, Any]] = []
for row in cur.execute(
"""
SELECT jc.id, jc.title, jc.description, jc.created_at
FROM job_criteria jc
ORDER BY jc.created_at DESC
"""
).fetchall():
skill_rows = cur.execute(
"""
SELECT s.name
FROM criteria_skills cs
JOIN skills s ON s.id = cs.skill_id
WHERE cs.criteria_id = ?
ORDER BY cs.id ASC
""",
(row["id"],),
).fetchall()
required_skills = _normalize_skill_list([skill_row["name"] for skill_row in skill_rows])
jobs.append(
{
"id": int(row["id"]),
"source": "real_db",
"title": row["title"] or "",
"description": row["description"] or "",
"required_skills": required_skills,
"required_years": parse_experience_years((row["description"] or "") + " " + (row["title"] or "")),
"languages_required": [],
}
)
conn.close()
return candidates, jobs
def _heuristic_label(candidate: dict[str, Any], job: dict[str, Any]) -> tuple[int, float]:
    """Score a candidate/job pair with the rule-based scorer and derive a 0/1 label.

    Skills on both sides are normalized first. Years of experience fall back
    to parsing the candidate raw text / job description when the explicit
    field is missing or falsy, and finally to 0. The Jaccard overlap of the
    two skill sets is what gets passed to ``compute_match_score`` as
    ``similarity_score``; education level is fixed at 2 on both sides.

    Returns:
        ``(label, score)`` where ``label`` is 1 when ``score >= 0.60``.
    """
    # NOTE(review): the 0.60 cut-off assumes compute_match_score returns a
    # value on a 0-1 scale - confirm against the scoring service.
    cv_skills = _normalize_skill_list(candidate.get("skills", []))
    wanted_skills = _normalize_skill_list(job.get("required_skills", []))

    cv_years = int(
        candidate.get("experience_years")
        or parse_experience_years(candidate.get("raw_text", ""))
        or 0
    )
    wanted_years = int(
        job.get("required_years")
        or parse_experience_years(job.get("description", ""))
        or 0
    )

    cv_set = set(cv_skills)
    wanted_set = set(wanted_skills)
    shared = cv_set & wanted_set
    combined = cv_set | wanted_set
    # max(1, ...) keeps the ratio defined when both skill sets are empty.
    jaccard = len(shared) / max(1, len(combined))

    score = compute_match_score(
        cv_skills=cv_skills,
        job_skills=wanted_skills,
        cv_years=cv_years,
        job_years=wanted_years,
        cv_edu_level=2,
        job_edu_level=2,
        similarity_score=jaccard,
    )
    if score >= 0.60:
        return 1, float(score)
    return 0, float(score)
def _rows_from_pairs(candidates: list[dict[str, Any]], jobs: list[dict[str, Any]], source: str) -> list[dict[str, Any]]:
rows: list[dict[str, Any]] = []
for candidate in candidates:
candidate_text = _candidate_text(candidate)
for job in jobs:
job_text = _job_text(job)
label, score = _heuristic_label(candidate, job)
rows.append(
{
"source": source,
"cv_id": candidate.get("id"),
"job_id": job.get("id"),
"candidate_name": candidate.get("full_name", ""),
"job_title": job.get("title", ""),
"cv_text": candidate_text,
"job_text": job_text,
"label": label,
"heuristic_score": round(score, 4),
}
)
return rows
def _synthetic_job_from_candidate(candidate: dict[str, Any], positive: bool, rng: random.Random, job_id: int) -> dict[str, Any]:
    """Generate a synthetic job that either matches or mismatches a candidate.

    Args:
        candidate: Candidate record; ``skills`` and ``experience_years`` are read.
        positive: When true, the job requires (up to four of) the candidate's
            own skills and one year less experience than the candidate has.
            When false, it requires up to four pool skills the candidate does
            not have and three more years of experience.
        rng: Random source used for every sampling decision.
        job_id: Identifier assigned to the generated job.

    Returns:
        A job dict with ``"source": "synthetic"`` and English as the only
        required language.
    """
    candidate_skills = _normalize_skill_list(candidate.get("skills", []))
    candidate_years = int(candidate.get("experience_years") or 0)

    if positive:
        required_skills = candidate_skills[:]
        if len(required_skills) > 4:
            required_skills = rng.sample(required_skills, 4)
        if not required_skills:
            # A candidate without skills still gets a single-skill job.
            required_skills = [rng.choice(SKILLS_POOL)]
        required_years = max(0, candidate_years - 1)
        title = f"Senior {' '.join(required_skills[:2]).replace(' ', '')} Engineer"
        description = (
            f"Looking for a developer with strong skills in {', '.join(required_skills)} "
            f"and {required_years}+ years of experience."
        )
    else:
        owned = set(candidate_skills)
        # Candidate skills are normalized, so pool entries are compared in
        # normalized form as well; comparing only the raw pool name could let
        # a differently spelled variant of an owned skill into a "negative" job.
        disjoint_pool = [
            skill
            for skill in SKILLS_POOL
            if skill not in owned and normalize_skill_name(skill) not in owned
        ]
        if len(disjoint_pool) < 3:
            # Too few non-overlapping skills left: fall back to the full pool.
            disjoint_pool = SKILLS_POOL[:]
        required_skills = rng.sample(disjoint_pool, min(4, len(disjoint_pool)))
        title = "Unrelated Engineer"
        description = f"Looking for a profile with expertise in {', '.join(required_skills)}."
        required_years = candidate_years + 3

    return {
        "id": job_id,
        "source": "synthetic",
        "title": title,
        "description": description,
        "required_skills": required_skills,
        "required_years": required_years,
        "languages_required": ["English"],
    }
def _build_dataset(db_path: Path, synthetic_candidates: int, synthetic_jobs: int, seed: int) -> pd.DataFrame:
    """Assemble the labeled pair dataset from real DB records plus synthetic pairs.

    Real rows are the full candidate x job cross product, labeled by the
    heuristic scorer (only when the database has both candidates and jobs).
    Every synthetic candidate contributes exactly two rows: one generated
    matching job (label 1) and one generated mismatching job (label 0).

    Args:
        db_path: SQLite database to read real candidates and jobs from.
        synthetic_candidates: Number of synthetic candidates to generate.
        synthetic_jobs: Currently unused; kept so existing callers keep
            working. Each synthetic candidate always gets two generated jobs.
        seed: Seed for the random source that drives synthetic job generation.

    Returns:
        A DataFrame with one row per pair, de-duplicated on
        ``(cv_text, job_text, label)`` and re-indexed from 0.

    Raises:
        RuntimeError: If neither the database nor the augmentation produced rows.
    """
    real_candidates, real_jobs = _load_real_data(db_path)
    rows: list[dict[str, Any]] = []
    if real_candidates and real_jobs:
        rows.extend(_rows_from_pairs(real_candidates, real_jobs, source="real_db"))

    rng = random.Random(seed)
    synthetic_records: list[dict[str, Any]] = []
    for index in range(synthetic_candidates):
        synthetic_id = 10_000 + index
        # NOTE(review): the seeded ``rng`` is not passed to the generator, so
        # run-to-run reproducibility depends on how generate_synthetic_candidate
        # draws its random values - confirm.
        item = generate_synthetic_candidate(user_id=synthetic_id)

        # ``or`` defaults guard against fields that are present but None, which
        # would otherwise break the string joins below.
        full_name = item.get("full_name") or ""
        skills = item.get("normalized_skills") or []
        education = item.get("education") or ""
        languages = item.get("languages") or []

        synthetic_records.append(
            {
                **item,
                # Make sure the ``cv_id`` column is populated even if the
                # generator does not return an ``id`` of its own.
                "id": item.get("id", synthetic_id),
                "source": "synthetic",
                "full_name": full_name,
                "email": item.get("email") or "",
                "raw_text": " ".join(
                    [
                        full_name,
                        " ".join(skills),
                        str(item.get("experience_years", 0)),
                        education,
                        " ".join(languages),
                    ]
                ),
                "skills": skills,
                "companies": [],
                "job_titles": [],
                "education": [education],
                "languages": languages,
            }
        )

    next_job_id = 20_000
    for candidate in synthetic_records:
        positive_job = _synthetic_job_from_candidate(candidate, positive=True, rng=rng, job_id=next_job_id)
        next_job_id += 1
        negative_job = _synthetic_job_from_candidate(candidate, positive=False, rng=rng, job_id=next_job_id)
        next_job_id += 1

        # The candidate text is identical for both jobs: build it once.
        candidate_text = _candidate_text(candidate)
        for job, label in ((positive_job, 1), (negative_job, 0)):
            # The label comes from how the job was constructed; the heuristic
            # score is stored only as an extra column.
            _, score = _heuristic_label(candidate, job)
            rows.append(
                {
                    "source": "synthetic",
                    "cv_id": candidate.get("id"),
                    "job_id": job.get("id"),
                    "candidate_name": candidate.get("full_name", ""),
                    "job_title": job.get("title", ""),
                    "cv_text": candidate_text,
                    "job_text": _job_text(job),
                    "label": label,
                    "heuristic_score": round(score, 4),
                }
            )

    df = pd.DataFrame(rows)
    if df.empty:
        raise RuntimeError("No training rows could be built from the database or synthetic augmentation.")
    df = df.drop_duplicates(subset=["cv_text", "job_text", "label"]).reset_index(drop=True)
    return df
def _build_matrix(df: pd.DataFrame, meta) -> np.ndarray:
return np.vstack([
build_pair_features(str(row.cv_text), str(row.job_text), meta)
for row in df.itertuples(index=False)
])
def _train_model(X_train: np.ndarray, y_train: np.ndarray):
    """Fit the supervised matcher, preferring XGBoost with a linear fallback.

    An ``XGBClassifier`` is tried first. If importing, constructing or fitting
    it fails for any reason, a class-balanced ``LogisticRegression`` is fitted
    instead and the reason is reported on stderr.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        ``(model, model_name)`` where ``model_name`` is ``"xgboost"`` or
        ``"logistic_regression"``.
    """
    try:
        from xgboost import XGBClassifier

        model = XGBClassifier(
            n_estimators=250,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.9,
            colsample_bytree=0.85,
            eval_metric="logloss",
            random_state=42,
        )
        model.fit(X_train, y_train)
        model_name = "xgboost"
    except Exception as exc:
        # Deliberately broad: a missing package, a broken native library and a
        # failed fit must all lead to the fallback. The reason is printed so a
        # silent downgrade cannot go unnoticed in the report.
        print(
            f"[warn] XGBoost unavailable or failed ({type(exc).__name__}: {exc}); "
            "falling back to logistic regression.",
            file=sys.stderr,
        )
        model = LogisticRegression(max_iter=2000, class_weight="balanced")
        model.fit(X_train, y_train)
        model_name = "logistic_regression"
    return model, model_name
def _predict_scores(model, X: np.ndarray) -> np.ndarray:
try:
scores = model.predict_proba(X)[:, 1]
except Exception:
try:
raw_scores = model.decision_function(X)
scores = 1.0 / (1.0 + np.exp(-raw_scores))
except Exception:
scores = model.predict(X).astype(float)
return np.clip(scores.astype(float), 0.0, 1.0)
def _metrics(y_true: np.ndarray, scores: np.ndarray, threshold: float = 0.5) -> SplitMetrics:
    """Compute classification metrics for *scores* cut at *threshold*.

    A score counts as a positive prediction when it is ``>= threshold``.
    ROC AUC is computed on the raw scores and reported as ``None`` if
    ``roc_auc_score`` raises. Precision, recall and F1 use ``zero_division=0``.
    """
    hard_labels = (scores >= threshold).astype(int)

    try:
        auc = float(roc_auc_score(y_true, scores))
    except Exception:
        auc = None

    values = {
        "accuracy": float(accuracy_score(y_true, hard_labels)),
        "precision": float(precision_score(y_true, hard_labels, zero_division=0)),
        "recall": float(recall_score(y_true, hard_labels, zero_division=0)),
        "f1": float(f1_score(y_true, hard_labels, zero_division=0)),
        "roc_auc": auc,
        "threshold": float(threshold),
    }
    return SplitMetrics(**values)
def _best_f1_threshold(y_true: np.ndarray, scores: np.ndarray) -> float:
    """Return the threshold on a 0.00-1.00 grid (step 0.01) that maximizes F1.

    Ties are resolved in favour of the lowest threshold.
    """

    def f1_at(cutoff: float) -> float:
        predicted = (scores >= cutoff).astype(int)
        return float(f1_score(y_true, predicted, zero_division=0))

    grid = np.linspace(0.0, 1.0, 101)
    # max() keeps the first maximal element, i.e. the lowest threshold on ties.
    return float(max(grid, key=f1_at))
def _threshold_for_precision(y_true: np.ndarray, scores: np.ndarray, target_precision: float, fallback: float) -> float:
    """Return the lowest grid threshold whose precision reaches *target_precision*.

    The grid runs from 0.00 to 1.00 in steps of 0.01 and is scanned upwards.
    *fallback* is returned when no threshold on the grid reaches the target.
    """

    def reaches_target(cutoff: float) -> bool:
        flagged = (scores >= cutoff).astype(int)
        return precision_score(y_true, flagged, zero_division=0) >= target_precision

    grid = np.linspace(0.0, 1.0, 101)
    first_hit = next((cutoff for cutoff in grid if reaches_target(cutoff)), fallback)
    return float(first_hit)
def build_and_train(db_path: Path, synthetic_candidates: int, synthetic_jobs: int, seed: int) -> dict[str, Any]:
    """Run the whole pipeline: dataset export, training, benchmark, artifacts.

    Steps, in order:
      1. Build the pair dataset and write it (plus a review sample of at most
         200 rows) as CSV under ``<repo_root>/data``.
      2. Split 80/20 (stratified on the label), fit the pair vectorizer on the
         training texts only, and train the model.
      3. Score both splits, compute metrics at threshold 0.5 and at the
         best-F1 threshold, and benchmark the lightweight semantic matcher on
         the test split.
      4. Derive accept/review thresholds and persist the model bundle (twice:
         final path and fallback path) and a JSON report.

    Args:
        db_path: SQLite database forwarded to ``_build_dataset``.
        synthetic_candidates: Forwarded to ``_build_dataset``.
        synthetic_jobs: Forwarded to ``_build_dataset``.
        seed: Used for dataset generation, the review sample and the split.

    Returns:
        A dict with the written paths (``dataset_path``, ``review_sample_path``,
        ``model_path``, ``fallback_model_path``, ``report_path``) as strings
        and the ``report`` dict itself.
    """
    df = _build_dataset(db_path, synthetic_candidates, synthetic_jobs, seed)

    # Output locations, all relative to the repository root.
    dataset_path = repo_root / "data" / "final_training_pairs.csv"
    review_sample_path = repo_root / "data" / "final_training_review_sample.csv"
    report_path = repo_root / "reports" / "advanced_matching_report.json"
    model_path = repo_root / "models" / "final_match_model.joblib"
    fallback_model_path = repo_root / "models" / "baseline_model.joblib"
    _ensure_parent(dataset_path)
    _ensure_parent(review_sample_path)
    _ensure_parent(report_path)
    # fallback_model_path lives in the same "models" directory as model_path.
    _ensure_parent(model_path)

    df.to_csv(dataset_path, index=False)
    df.sample(min(200, len(df)), random_state=seed).to_csv(review_sample_path, index=False)

    train_df, test_df = train_test_split(
        df,
        test_size=0.20,
        random_state=seed,
        stratify=df["label"],
    )
    # The vectorizer is fitted on training texts only, then reused for both splits.
    meta = fit_pair_vectorizer(train_df["cv_text"].tolist(), train_df["job_text"].tolist())
    X_train = _build_matrix(train_df, meta)
    X_test = _build_matrix(test_df, meta)
    y_train = train_df["label"].to_numpy()
    y_test = test_df["label"].to_numpy()

    model, model_name = _train_model(X_train, y_train)
    train_scores = _predict_scores(model, X_train)
    test_scores = _predict_scores(model, X_test)

    train_metrics = _metrics(y_train, train_scores, threshold=0.5)
    test_metrics = _metrics(y_test, test_scores, threshold=0.5)
    # NOTE(review): the best-F1 threshold and the accept/review thresholds
    # below are all selected on the test split, i.e. the same data the reported
    # test metrics come from.
    best_threshold = _best_f1_threshold(y_test, test_scores)
    best_metrics = _metrics(y_test, test_scores, threshold=best_threshold)

    # Benchmark: lightweight semantic matcher on the same test pairs, cut at 0.5.
    siamese = get_siamese_matcher()
    semantic_scores = np.array([
        siamese.compute_pair_similarity(row.cv_text, row.job_text) for row in test_df.itertuples(index=False)
    ], dtype=float)
    semantic_metrics = _metrics(y_test, semantic_scores, threshold=0.5)

    # Accept: lowest threshold reaching 0.90 precision, but never below 0.80.
    accept_threshold = max(
        0.80,
        _threshold_for_precision(y_test, test_scores, target_precision=0.90, fallback=0.80),
    )
    # Review: lowest threshold reaching 0.70 precision, but never below 0.50.
    review_threshold = max(
        0.50,
        _threshold_for_precision(y_test, test_scores, target_precision=0.70, fallback=0.50),
    )
    # Keep the review threshold at least 0.05 under the accept threshold.
    review_threshold = float(min(review_threshold, max(0.0, accept_threshold - 0.05)))

    positive_rate = float(df["label"].mean())
    dataset_summary = {
        "rows_total": int(len(df)),
        "rows_train": int(len(train_df)),
        "rows_test": int(len(test_df)),
        "source_counts": df["source"].value_counts().to_dict(),
        "positive_rate": positive_rate,
        "seed": int(seed),
    }
    # Thresholds are stored as percentages (score * 100).
    thresholds = {
        "accept_pct": round(accept_threshold * 100.0, 2),
        "review_pct": round(review_threshold * 100.0, 2),
    }
    bundle = {
        "model": model,
        "meta": meta,
        "model_name": model_name,
        "thresholds": thresholds,
        "dataset_summary": dataset_summary,
        "training_metrics": {
            "train": asdict(train_metrics),
            "test": asdict(test_metrics),
            "test_best_f1": asdict(best_metrics),
            "lightweight_semantic": asdict(semantic_metrics),
        },
    }
    # The same bundle is written under both file names.
    joblib.dump(bundle, model_path)
    joblib.dump(bundle, fallback_model_path)

    report = {
        "dataset": dataset_summary,
        "model": {
            "name": model_name,
            "path": str(model_path.resolve()),
            "fallback_path": str(fallback_model_path.resolve()),
        },
        "metrics": bundle["training_metrics"],
        "production_recommendation": {
            "accept_threshold_score_pct": thresholds["accept_pct"],
            "review_threshold_score_pct": thresholds["review_pct"],
            "env": {
                "MATCH_ACCEPT_THRESHOLD": str(thresholds["accept_pct"]),
                "MATCH_REVIEW_THRESHOLD": str(thresholds["review_pct"]),
            },
        },
    }
    report_path.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")

    return {
        "dataset_path": str(dataset_path),
        "review_sample_path": str(review_sample_path),
        "model_path": str(model_path),
        "fallback_model_path": str(fallback_model_path),
        "report_path": str(report_path),
        "report": report,
    }
def main() -> None:
    """Parse the command line, run the pipeline and print a summary to stdout."""
    cli = argparse.ArgumentParser()
    default_db = str(repo_root / "backend" / "ai_talent_finder.db")
    cli.add_argument("--db", default=default_db, help="SQLite DB path")
    # The remaining options are plain integers that only differ by default.
    for flag, default_value in (
        ("--synthetic-candidates", 40),
        ("--synthetic-jobs", 10),
        ("--seed", 42),
    ):
        cli.add_argument(flag, type=int, default=default_value)
    options = cli.parse_args()

    outcome = build_and_train(
        Path(options.db),
        synthetic_candidates=options.synthetic_candidates,
        synthetic_jobs=options.synthetic_jobs,
        seed=options.seed,
    )

    summary_lines = (
        "=== Final matching artifacts built ===",
        json.dumps(outcome["report"], indent=2, ensure_ascii=False),
        f"Dataset: {outcome['dataset_path']}",
        f"Review sample: {outcome['review_sample_path']}",
        f"Model bundle: {outcome['model_path']}",
        f"Fallback bundle: {outcome['fallback_model_path']}",
        f"Report: {outcome['report_path']}",
    )
    for line in summary_lines:
        print(line)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()