| import json |
|
|
| from fastapi import APIRouter, HTTPException, UploadFile, File, Form |
| from typing import Dict, Any, Optional |
|
|
| from app.services.cv_extractor import CVExtractionService |
| from app.services.feature_engineering import PairFeatureMeta, build_pair_features, fit_pair_vectorizer |
| from app.services.matching_service import MatchingService |
| from app.services.scoring import compute_match_score, apply_business_rules |
| from app.scoring.decision import combine_scores, decision_from_score |
| import joblib |
| import pickle |
| from pathlib import Path |
| import numpy as _np |
|
|
| router = APIRouter(prefix="/api/pipeline", tags=["pipeline"]) |
|
|
|
|
def _load_bert_first_model_bundle():
    """Load the BERT-first pairwise scoring bundle if it exists.

    Looks in ``models/classical_from_tfidf_bertfirst`` for
    ``pair_feature_meta.joblib`` and for the first loadable model among
    ``xgboost.model``, ``random_forest.joblib`` and ``logistic.joblib``
    (tried in that order).

    Returns:
        ``{"model": <model>, "meta": <meta>}`` on success, or ``None`` when the
        directory, the metadata file, or every candidate model is missing or
        cannot be loaded. Loading is best-effort: failures are logged and
        never raised to the caller.
    """
    import logging

    log = logging.getLogger(__name__)

    models_dir = Path("models/classical_from_tfidf_bertfirst")
    if not models_dir.exists():
        return None

    meta_path = models_dir / "pair_feature_meta.joblib"
    if not meta_path.exists():
        return None

    # NOTE(security): joblib.load unpickles arbitrary objects; the files in
    # this directory must only ever come from a trusted build.
    try:
        meta = joblib.load(meta_path)
    except Exception:
        log.warning("Could not load pair feature metadata from %s", meta_path, exc_info=True)
        return None

    for filename in ("xgboost.model", "random_forest.joblib", "logistic.joblib"):
        model_path = models_dir / filename
        if not model_path.exists():
            continue

        try:
            model = joblib.load(model_path)
        except Exception:
            # Not a joblib/pickle artifact: retry with xgboost's native loader.
            try:
                import xgboost as xgb

                model = xgb.Booster()
                model.load_model(str(model_path))
            except Exception:
                log.warning("Could not load model file %s", model_path, exc_info=True)
                continue

        if model is not None:
            return {"model": model, "meta": meta}

    return None
|
|
|
|
def _similarity_for_mode(matcher, mode, cv_text, job_text):
    """Return the raw CV/job similarity produced by ``matcher`` for ``mode``."""
    if mode in ("semantic", "vector"):
        # "vector" has no dedicated implementation; it shares the semantic scorer.
        return matcher.semantic_similarity(cv_text, job_text)
    if mode == "continuous":
        cont = matcher.continuous_similarity(job_text, cv_text)
        return cont.get("max", cont.get("mean", 0.0))
    # Any other mode value falls through to the deep matcher.
    return matcher.deep_match_score(job_text, cv_text)


def _ml_score_percent(cv_text, job_text):
    """Score the pair with the persisted model bundle, scaled by 100.

    Returns ``None`` when no bundle is available, the features cannot be built,
    or neither ``predict_proba`` nor ``predict`` succeeds on them.
    """
    import logging

    log = logging.getLogger(__name__)

    bundle = _load_bert_first_model_bundle()
    if bundle is None:
        return None

    try:
        clf = bundle["model"]
        X = build_pair_features(cv_text, job_text, bundle["meta"])
    except Exception:
        log.warning("Could not build pair features for ML scoring", exc_info=True)
        return None

    try:
        # Preferred: probability of the positive class (column 1).
        return float(clf.predict_proba(X)[:, 1].ravel()[0] * 100.0)
    except Exception:
        pass

    try:
        # Fallback for models without a usable predict_proba.
        return float(clf.predict(X).ravel()[0] * 100.0)
    except Exception:
        log.warning("Loaded model could not score the pair features", exc_info=True)
        return None


@router.post("/run")
def run_pipeline(payload: Dict[str, Any]):
    """Run the pipeline: extraction -> matching -> scoring -> decision.

    Expected payload keys:
        candidate: object whose ``raw_text`` holds the CV text.
        job: object with ``job_text`` (or ``description``) and optional
            ``skills`` / ``years_experience``.
        mode: ``"semantic"`` (default), ``"vector"``, ``"continuous"``; any
            other value selects the deep matcher.

    Returns:
        A dict with ``extraction``, ``similarity``, ``ml_score``,
        ``final_score`` and ``decision`` entries.

    Raises:
        HTTPException: 400 when ``candidate`` or ``job`` is missing, empty, or
            not a JSON object.
    """
    candidate = payload.get("candidate")
    job = payload.get("job")
    mode = payload.get("mode", "semantic")

    if not candidate or not job:
        raise HTTPException(status_code=400, detail="Both 'candidate' and 'job' are required in payload")
    # Both are accessed with .get() below; reject other JSON types up front
    # instead of failing later with an AttributeError (HTTP 500).
    if not isinstance(candidate, dict) or not isinstance(job, dict):
        raise HTTPException(status_code=400, detail="'candidate' and 'job' must be JSON objects")

    extractor = CVExtractionService()
    extraction = extractor.extract_from_text(candidate.get("raw_text") or "")

    cv_text = extraction.raw_text or ""
    job_text = job.get("job_text") or job.get("description") or ""

    matcher = MatchingService()
    raw_sim = _similarity_for_mode(matcher, mode, cv_text, job_text)
    # Normalise once so every later use sees a plain float, never None.
    sim = float(raw_sim) if raw_sim is not None else 0.0

    structured = extraction.structured or {}
    # NOTE(review): this rule-based score is computed but not used in the
    # final score or the response; confirm whether it should be surfaced.
    compute_match_score(
        cv_skills=structured.get("skills", []),
        job_skills=job.get("skills", []),
        cv_years=structured.get("years_experience", 0),
        job_years=job.get("years_experience", 0),
        similarity_score=sim,
    )

    ml_score_pct = _ml_score_percent(cv_text, job_text)
    if ml_score_pct is None:
        # No usable model: fall back to the extraction quality score.
        ml_score_pct = float(extraction.quality_score or 0)

    sim_pct = sim * 100.0
    final_score = combine_scores(sim_pct, ml_score_pct, w_sim=0.5, w_ml=0.5)
    decision_label, decision_meta = decision_from_score(final_score)

    return {
        "extraction": {
            "quality_score": extraction.quality_score,
            "structured": extraction.structured,
        },
        "similarity": sim,
        "ml_score": ml_score_pct,
        "final_score": final_score,
        "decision": {"label": decision_label, "meta": decision_meta},
    }
|
|
|
|
def np_dot_cosine(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b`` as a float.

    Yields ``0.0`` when either vector has zero Euclidean norm, so the
    division below never sees a zero denominator.
    """
    import numpy as _np

    norm_a, norm_b = _np.linalg.norm(a), _np.linalg.norm(b)
    if not (norm_a != 0 and norm_b != 0):
        return 0.0

    cosine = _np.dot(a, b) / (norm_a * norm_b)
    return float(cosine)
|
|
|
|
|
|
@router.post("/run-full")
async def run_full(cv: UploadFile = File(...), job_json: str = Form(...)):
    """Upload a CV file plus a job JSON; return the decision and top-K matches.

    The upload is written to a temporary file for extraction and removed
    afterwards. If file-based extraction raises, the raw bytes are decoded as
    UTF-8 text and extracted from that instead.

    Args:
        cv: The uploaded CV file.
        job_json: JSON object string with ``job_text`` (or ``description``)
            and an optional integer ``top_k`` (default 5).

    Returns:
        ``{"decision": <output of run_pipeline>, "top_k": {"job_text": ...,
        "top_k": ..., "results": ...}}``.

    Raises:
        HTTPException: 400 for malformed ``job_json`` or ``top_k``; 500 when
            the upload cannot be written to disk.
    """
    import tempfile

    try:
        job = json.loads(job_json)
    except (ValueError, RecursionError) as exc:
        raise HTTPException(status_code=400, detail="'job_json' must be valid JSON") from exc

    if not isinstance(job, dict):
        raise HTTPException(status_code=400, detail="'job_json' must decode to an object")

    # Validate top_k before any expensive work so bad input is a 400, not a
    # 500 raised after the whole pipeline has already run.
    try:
        top_k = int(job.get("top_k", 5))
    except (TypeError, ValueError, OverflowError) as exc:
        raise HTTPException(status_code=400, detail="'top_k' must be an integer") from exc
    if top_k < 1:
        raise HTTPException(status_code=400, detail="'top_k' must be a positive integer")

    # cv.filename may be None; only its extension is reused, never its path.
    suffix = Path(cv.filename or "").suffix or ".pdf"
    content = await cv.read()

    extractor = CVExtractionService()
    dest = None
    try:
        try:
            with tempfile.NamedTemporaryFile(prefix="uploaded_cv_", suffix=suffix, delete=False) as tmp:
                dest = Path(tmp.name)
                tmp.write(content)
        except OSError as exc:
            raise HTTPException(status_code=500, detail=f"Failed to save uploaded file: {exc}") from exc

        try:
            extraction = extractor.extract_from_pdf(str(dest))
        except Exception:
            # Best-effort fallback: treat the upload as plain text.
            text = content.decode("utf-8", errors="ignore")
            extraction = extractor.extract_from_text(text)
    finally:
        # Always remove the temporary copy so uploads do not pile up on disk.
        if dest is not None:
            dest.unlink(missing_ok=True)

    payload = {"candidate": {"raw_text": extraction.raw_text or ""}, "job": job}
    decision_payload = run_pipeline(payload)

    matcher = MatchingService()
    job_text = job.get("job_text") or job.get("description") or ""
    top_k_results = matcher.search_top_k_candidates(job_text=job_text, top_k=top_k)

    return {
        "decision": decision_payload,
        "top_k": {
            "job_text": job_text,
            "top_k": top_k,
            "results": top_k_results,
        },
    }
|
|
|
|
@router.post("/top-k")
def top_k_candidates(payload: Dict[str, Any]):
    """Return the top-K CV candidates for a job description using FAISS.

    Expected payload:
    {
      "job_text": "...",
      "top_k": 5,
      "index_dir": "models/faiss_index"
    }

    ``description`` is accepted as an alias for ``job_text``; ``top_k`` and
    ``index_dir`` are optional and default to the values shown above.

    Raises:
        HTTPException: 400 when ``job_text`` is missing, blank or not a
            string, or when ``top_k`` is not a positive integer.
    """
    job_text = payload.get("job_text") or payload.get("description") or ""
    # Guard the type as well: a non-string value would crash on .strip().
    if not isinstance(job_text, str) or not job_text.strip():
        raise HTTPException(status_code=400, detail="'job_text' is required")

    try:
        top_k = int(payload.get("top_k", 5))
    except (TypeError, ValueError, OverflowError) as exc:
        raise HTTPException(status_code=400, detail="'top_k' must be an integer") from exc
    if top_k < 1:
        raise HTTPException(status_code=400, detail="'top_k' must be a positive integer")

    # NOTE(security): index_dir comes straight from the request body, so a
    # client can point the matcher at any path on the server. Consider
    # restricting it to a fixed set of known index directories.
    index_dir = payload.get("index_dir") or "models/faiss_index"

    matcher = MatchingService()
    results = matcher.search_top_k_candidates(job_text=job_text, top_k=top_k, index_dir=index_dir)

    return {
        "job_text": job_text,
        "top_k": top_k,
        "index_dir": index_dir,
        "results": results,
    }
|
|