Spaces:

RHmaster
/

ai-talent-finder-backend

Running

ai-talent-finder-backend / app /api /pipeline.py

ilyass yani

Deploiement backend dans HF Spaces

9df97a2 8 days ago

8.35 kB

	import json

	from fastapi import APIRouter, HTTPException, UploadFile, File, Form
	from typing import Dict, Any, Optional

	from app.services.cv_extractor import CVExtractionService
	from app.services.feature_engineering import PairFeatureMeta, build_pair_features, fit_pair_vectorizer
	from app.services.matching_service import MatchingService
	from app.services.scoring import compute_match_score, apply_business_rules
	from app.scoring.decision import combine_scores, decision_from_score
	import joblib
	import pickle
	from pathlib import Path
	import numpy as _np

	router = APIRouter(prefix="/api/pipeline", tags=["pipeline"])


	def _load_bert_first_model_bundle():
	    """Load the BERT-first pairwise scoring bundle if it exists.

	    Looks in ``models/classical_from_tfidf_bertfirst`` (relative to the
	    current working directory) for ``pair_feature_meta.joblib`` and for the
	    first loadable model among ``xgboost.model``, ``random_forest.joblib``
	    and ``logistic.joblib``, tried in that order.

	    Returns:
	        ``{"model": <model>, "meta": <meta>}`` on success, or ``None`` when
	        the metadata file is missing/unreadable or no model could be loaded.
	        Loading is best-effort: failures are logged, never raised.
	    """
	    import logging

	    log = logging.getLogger(__name__)
	    models_dir = Path("models/classical_from_tfidf_bertfirst")
	    meta_path = models_dir / "pair_feature_meta.joblib"
	    # A missing directory implies a missing metadata file, so one check suffices.
	    if not meta_path.exists():
	        return None

	    # NOTE: joblib files are pickle-based; only artefacts shipped with the
	    # deployment should ever be placed in this directory.
	    try:
	        meta = joblib.load(meta_path)
	    except Exception:
	        log.warning("Could not load pair feature meta from %s", meta_path, exc_info=True)
	        return None

	    for filename in ("xgboost.model", "random_forest.joblib", "logistic.joblib"):
	        model_path = models_dir / filename
	        if not model_path.exists():
	            continue
	        try:
	            model = joblib.load(model_path)
	        except Exception:
	            log.debug("joblib could not load %s; trying the native XGBoost loader", model_path, exc_info=True)
	            try:
	                import xgboost as xgb

	                model = xgb.Booster()
	                model.load_model(str(model_path))
	            except Exception:
	                log.warning("Could not load model file %s", model_path, exc_info=True)
	                continue
	        if model is not None:
	            return {"model": model, "meta": meta}

	    return None


	@router.post("/run")
	def run_pipeline(payload: Dict[str, Any]):
	    """Run full pipeline: extraction -> matching -> scoring -> decision.

	    Args:
	        payload: Request body containing
	            ``candidate`` - object whose ``raw_text`` holds the CV text
	            (``candidate_id`` lookup is not implemented),
	            ``job`` - object with ``job_text`` (or ``description``) and
	            optional ``skills`` / ``years_experience``,
	            ``mode`` - ``"semantic"`` (default), ``"vector"`` or
	            ``"continuous"``; any other value uses the deep match score.

	    Returns:
	        A dict with ``extraction``, ``similarity``, ``ml_score``,
	        ``final_score`` and ``decision`` keys.

	    Raises:
	        HTTPException: 400 when ``candidate`` or ``job`` is missing or is
	            not a JSON object.
	    """
	    import logging

	    log = logging.getLogger(__name__)

	    candidate = payload.get("candidate")
	    job = payload.get("job")
	    mode = payload.get("mode", "semantic")

	    if not candidate or not job:
	        raise HTTPException(status_code=400, detail="Both 'candidate' and 'job' are required in payload")
	    # Both values are accessed with .get() below; reject other JSON types
	    # up front instead of failing with an AttributeError (HTTP 500).
	    if not isinstance(candidate, dict) or not isinstance(job, dict):
	        raise HTTPException(status_code=400, detail="'candidate' and 'job' must be JSON objects")

	    # Extraction: a candidate without raw text is extracted from an empty string.
	    extractor = CVExtractionService()
	    extraction = extractor.extract_from_text(candidate.get("raw_text") or "")

	    cv_text = extraction.raw_text or ""
	    job_text = job.get("job_text") or job.get("description") or ""

	    # Matching. "vector" is kept aligned with the BERT-first matching stack,
	    # so it shares the semantic similarity path.
	    matcher = MatchingService()
	    if mode in ("semantic", "vector"):
	        sim = matcher.semantic_similarity(cv_text, job_text)
	    elif mode == "continuous":
	        cont = matcher.continuous_similarity(job_text, cv_text)
	        sim = cont.get("max", cont.get("mean", 0.0))
	    else:
	        sim = matcher.deep_match_score(job_text, cv_text)

	    # Normalise once so that a missing similarity is treated as 0.0 everywhere.
	    similarity = float(sim) if sim is not None else 0.0

	    # Heuristic score from skills / experience / similarity.
	    structured = extraction.structured or {}
	    # NOTE(review): this score is computed but not included in the response
	    # or the final decision - confirm whether it should be.
	    score = compute_match_score(
	        cv_skills=structured.get("skills", []),
	        job_skills=job.get("skills", []),
	        cv_years=structured.get("years_experience", 0),
	        job_years=job.get("years_experience", 0),
	        similarity_score=similarity,
	    )

	    # Try to augment with the ML model score (if trained models are present).
	    ml_score_pct = None
	    model_bundle = _load_bert_first_model_bundle()
	    if model_bundle is not None:
	        try:
	            clf = model_bundle["model"]
	            X = build_pair_features(cv_text, job_text, model_bundle["meta"])
	            try:
	                prob = clf.predict_proba(X)[:, 1].ravel()[0]
	                ml_score_pct = float(prob * 100.0)
	            except Exception:
	                # Models without predict_proba fall back to predict().
	                pred = clf.predict(X).ravel()[0]
	                ml_score_pct = float(pred * 100.0)
	        except Exception:
	            log.warning("ML scoring failed; falling back to extraction quality score", exc_info=True)
	            ml_score_pct = None

	    # Without an ML score, fall back to the extraction quality heuristic.
	    if ml_score_pct is None:
	        ml_score_pct = float(extraction.quality_score or 0)

	    # Similarity is scaled by 100 before being blended with the ML percentage.
	    sim_pct = similarity * 100.0
	    final_score = combine_scores(sim_pct, ml_score_pct, w_sim=0.5, w_ml=0.5)
	    decision_label, decision_meta = decision_from_score(final_score)

	    return {
	        "extraction": {
	            "quality_score": extraction.quality_score,
	            "structured": extraction.structured,
	        },
	        "similarity": similarity,
	        "ml_score": ml_score_pct,
	        "final_score": final_score,
	        "decision": {"label": decision_label, "meta": decision_meta},
	    }


	def np_dot_cosine(a, b):
	    """Return the cosine similarity of ``a`` and ``b`` as a Python float.

	    The value is the dot product divided by the product of the two norms;
	    0.0 is returned when either norm is zero, which avoids dividing by zero.
	    """
	    import numpy as _np

	    norm_a, norm_b = _np.linalg.norm(a), _np.linalg.norm(b)
	    if any(norm == 0 for norm in (norm_a, norm_b)):
	        return 0.0
	    cosine = _np.dot(a, b) / (norm_a * norm_b)
	    return float(cosine)



	@router.post("/run-full")
	async def run_full(cv: UploadFile = File(...), job_json: str = Form(...)):
	    """Upload a CV file and a job JSON; return the pipeline decision and top-K.

	    The uploaded CV is written to a temporary file, text is extracted from it
	    (PDF extraction first, UTF-8 decoding of the raw bytes as fallback), the
	    extracted text is passed to ``run_pipeline``, and a top-K candidate search
	    is run for the job text. The temporary file is removed before returning.

	    Args:
	        cv: Uploaded CV file.
	        job_json: JSON-encoded job object (``job_text`` or ``description``,
	            optional ``top_k``, plus whatever ``run_pipeline`` reads).

	    Returns:
	        ``{"decision": <run_pipeline output>, "top_k": {"job_text", "top_k", "results"}}``.

	    Raises:
	        HTTPException: 400 when ``job_json`` is not valid JSON, does not
	            decode to an object, or carries a non-integer ``top_k``;
	            500 when the upload cannot be saved.
	    """
	    import tempfile

	    try:
	        job = json.loads(job_json)
	    except (TypeError, ValueError, RecursionError) as exc:
	        raise HTTPException(status_code=400, detail="'job_json' must be valid JSON") from exc

	    if not isinstance(job, dict):
	        raise HTTPException(status_code=400, detail="'job_json' must decode to an object")

	    # Validate cheap inputs before doing any extraction or model work.
	    try:
	        top_k = int(job.get("top_k", 5))
	    except (TypeError, ValueError, OverflowError) as exc:
	        raise HTTPException(status_code=400, detail="'top_k' must be an integer") from exc

	    content = await cv.read()
	    # The client may omit the filename; default to a PDF suffix in that case.
	    suffix = Path(cv.filename or "").suffix or ".pdf"

	    extractor = CVExtractionService()
	    saved_path = None
	    try:
	        try:
	            with tempfile.NamedTemporaryFile(prefix="uploaded_cv_", suffix=suffix, delete=False) as handle:
	                saved_path = Path(handle.name)
	                handle.write(content)
	        except OSError as exc:
	            raise HTTPException(status_code=500, detail=f"Failed to save uploaded file: {exc}") from exc

	        try:
	            extraction = extractor.extract_from_pdf(str(saved_path))
	        except Exception:
	            # fallback: treat the upload as plain text
	            text = content.decode("utf-8", errors="ignore")
	            extraction = extractor.extract_from_text(text)
	    finally:
	        # Never leave uploaded CVs behind on disk.
	        if saved_path is not None:
	            try:
	                saved_path.unlink()
	            except OSError:
	                pass

	    payload = {"candidate": {"raw_text": extraction.raw_text or ""}, "job": job}
	    # NOTE(review): run_pipeline is synchronous and runs inside this async
	    # handler, so it blocks the event loop while it executes.
	    decision_payload = run_pipeline(payload)

	    matcher = MatchingService()
	    job_text = job.get("job_text") or job.get("description") or ""
	    top_k_results = matcher.search_top_k_candidates(job_text=job_text, top_k=top_k)

	    return {
	        "decision": decision_payload,
	        "top_k": {
	            "job_text": job_text,
	            "top_k": top_k,
	            "results": top_k_results,
	        },
	    }


	@router.post("/top-k")
	def top_k_candidates(payload: Dict[str, Any]):
	    """Return the top-K CV candidates for a job description using FAISS.

	    Expected payload:
	    {
	        "job_text": "...",
	        "top_k": 5,
	        "index_dir": "models/faiss_index"
	    }

	    ``description`` is accepted as an alias for ``job_text``; ``top_k``
	    defaults to 5 and ``index_dir`` to ``models/faiss_index``.

	    Returns:
	        A dict echoing ``job_text``, ``top_k`` and ``index_dir`` together
	        with the search ``results``.

	    Raises:
	        HTTPException: 400 when ``job_text`` is missing, blank or not a
	            string, or when ``top_k`` is not an integer.
	    """
	    job_text = payload.get("job_text") or payload.get("description") or ""
	    if not isinstance(job_text, str):
	        raise HTTPException(status_code=400, detail="'job_text' must be a string")
	    if not job_text.strip():
	        raise HTTPException(status_code=400, detail="'job_text' is required")

	    try:
	        top_k = int(payload.get("top_k", 5))
	    except (TypeError, ValueError, OverflowError) as exc:
	        raise HTTPException(status_code=400, detail="'top_k' must be an integer") from exc

	    # NOTE(review): index_dir comes straight from the request body, so a
	    # client can point the search at any path the service can read.
	    # Consider restricting it to an allow-list of known index directories.
	    index_dir = payload.get("index_dir", "models/faiss_index")

	    matcher = MatchingService()
	    results = matcher.search_top_k_candidates(job_text=job_text, top_k=top_k, index_dir=index_dir)

	    return {
	        "job_text": job_text,
	        "top_k": top_k,
	        "index_dir": index_dir,
	        "results": results,
	    }