# Spaces: RHmaster/ai-talent-finder-backend (Running)
# File size: 8,350 bytes
# 9df97a2

import json

from fastapi import APIRouter, HTTPException, UploadFile, File, Form
from typing import Dict, Any, Optional

from app.services.cv_extractor import CVExtractionService
from app.services.feature_engineering import PairFeatureMeta, build_pair_features, fit_pair_vectorizer
from app.services.matching_service import MatchingService
from app.services.scoring import compute_match_score, apply_business_rules
from app.scoring.decision import combine_scores, decision_from_score
import joblib
import pickle
from pathlib import Path
import numpy as _np

# Router for the pipeline endpoints; every route registered on it is served
# under the "/api/pipeline" prefix and tagged "pipeline".
router = APIRouter(prefix="/api/pipeline", tags=["pipeline"])


def _load_bert_first_model_bundle():
    """Return the BERT-first pairwise scoring bundle, or ``None`` if unavailable.

    The bundle lives in ``models/classical_from_tfidf_bertfirst`` and consists
    of the pair-feature metadata plus the first model file that can be loaded,
    tried in this order: ``xgboost.model``, ``random_forest.joblib``,
    ``logistic.joblib``.

    Returns:
        ``{"model": <loaded model>, "meta": <loaded metadata>}`` on success,
        ``None`` when the directory, the metadata file, or a loadable model is
        missing. Loading errors are swallowed on purpose: the bundle is optional.
    """
    bundle_dir = Path("models/classical_from_tfidf_bertfirst")
    feature_meta_file = bundle_dir / "pair_feature_meta.joblib"
    if not (bundle_dir.exists() and feature_meta_file.exists()):
        return None

    try:
        feature_meta = joblib.load(feature_meta_file)
    except Exception:
        return None

    def _read_estimator(path):
        """Load one model file: joblib first, then a native XGBoost booster."""
        try:
            return joblib.load(path)
        except Exception:
            pass
        try:
            import xgboost as xgb

            booster = xgb.Booster()
            booster.load_model(str(path))
            return booster
        except Exception:
            return None

    for candidate_name in ("xgboost.model", "random_forest.joblib", "logistic.joblib"):
        candidate_file = bundle_dir / candidate_name
        if not candidate_file.exists():
            continue
        estimator = _read_estimator(candidate_file)
        if estimator is not None:
            # First successfully loaded model wins.
            return {"model": estimator, "meta": feature_meta}

    return None


@router.post("/run")
def run_pipeline(payload: Dict[str, Any]):
    """Run full pipeline: extraction -> features -> matching -> scoring -> decision.

    Args:
        payload: Request body with:
            * ``candidate`` - JSON object; its ``raw_text`` entry is the CV
              text (a missing or empty value is treated as an empty CV).
              ``candidate_id`` is not implemented.
            * ``job`` - JSON object providing ``job_text`` (or ``description``)
              and optionally ``skills`` and ``years_experience``.
            * ``mode`` - optional similarity backend: ``"semantic"`` (default),
              ``"vector"`` (same code path as semantic), ``"continuous"``;
              any other value falls back to the deep match score.

    Returns:
        Dict with ``extraction`` (``quality_score`` and ``structured``),
        ``similarity`` (float), ``ml_score`` (model output times 100, or the
        extraction quality score when no model score is available),
        ``final_score`` and ``decision`` (``label`` and ``meta``).

    Raises:
        HTTPException: 400 when ``candidate`` or ``job`` is missing, empty, or
            not a JSON object.
    """
    candidate = payload.get("candidate")
    job = payload.get("job")
    mode = payload.get("mode", "semantic")

    if not candidate or not job:
        raise HTTPException(status_code=400, detail="Both 'candidate' and 'job' are required in payload")
    # Both values are read with .get() below; reject other JSON types up front
    # so a malformed request yields a 400 instead of an AttributeError (500).
    if not isinstance(candidate, dict) or not isinstance(job, dict):
        raise HTTPException(status_code=400, detail="'candidate' and 'job' must be JSON objects")

    # Extraction: structured candidates without raw text are extracted from an
    # empty string, which yields a lightweight extraction result.
    extractor = CVExtractionService()
    extraction = extractor.extract_from_text(candidate.get("raw_text") or "")

    cv_text = extraction.raw_text or ""
    job_text = job.get("job_text") or job.get("description") or ""

    # Feature engineering: fit a small TF-IDF meta on-the-fly.
    # NOTE(review): `features` is not consumed anywhere below; the calls are
    # kept because the helpers' side effects are not visible from this module.
    adhoc_meta = fit_pair_vectorizer([cv_text], [job_text])
    features = build_pair_features(cv_text, job_text, adhoc_meta)

    # Matching
    matcher = MatchingService()
    if mode in ("semantic", "vector"):
        # "vector" is kept aligned with the BERT-first (semantic) matching stack.
        sim = matcher.semantic_similarity(cv_text, job_text)
    elif mode == "continuous":
        cont = matcher.continuous_similarity(job_text, cv_text)
        sim = cont.get("max", cont.get("mean", 0.0))
    else:
        sim = matcher.deep_match_score(job_text, cv_text)

    # Normalise once so every later use agrees; a missing similarity counts as 0.
    similarity = float(sim) if sim is not None else 0.0

    # Scoring and decision
    structured = extraction.structured or {}
    cv_skills = structured.get("skills", []) if structured else []
    cv_years = structured.get("years_experience", 0) if structured else 0
    job_skills = job.get("skills", [])
    job_years = job.get("years_experience", 0)

    # NOTE(review): the rule-based score is computed but not part of the
    # response or the final score; confirm whether it should be.
    score = compute_match_score(
        cv_skills=cv_skills,
        job_skills=job_skills,
        cv_years=cv_years,
        job_years=job_years,
        similarity_score=similarity,
    )

    # Try to augment with ML model score (if trained models are present).
    # Best effort: any failure leaves ml_score_pct as None.
    ml_score_pct = None
    model_bundle = _load_bert_first_model_bundle()
    if model_bundle is not None:
        try:
            clf = model_bundle["model"]
            X = build_pair_features(cv_text, job_text, model_bundle["meta"])
            try:
                ml_score_pct = float(clf.predict_proba(X)[:, 1].ravel()[0] * 100.0)
            except Exception:
                # Models without predict_proba: fall back to the raw prediction.
                ml_score_pct = float(clf.predict(X).ravel()[0] * 100.0)
        except Exception:
            ml_score_pct = None

    # No model score -> fall back to the extraction quality heuristic.
    if ml_score_pct is None:
        ml_score_pct = float(extraction.quality_score or 0)

    # Similarity is scaled by 100 before being combined with the ML score.
    sim_pct = similarity * 100.0
    final_score = combine_scores(sim_pct, ml_score_pct, w_sim=0.5, w_ml=0.5)
    decision_label, decision_meta = decision_from_score(final_score)

    return {
        "extraction": {
            "quality_score": extraction.quality_score,
            "structured": extraction.structured,
        },
        "similarity": similarity,
        "ml_score": ml_score_pct,
        "final_score": final_score,
        "decision": {"label": decision_label, "meta": decision_meta},
    }


def np_dot_cosine(a, b):
    """Return the cosine similarity of ``a`` and ``b`` as a Python float.

    Yields ``0.0`` when either input has zero norm, avoiding a division by zero.
    """
    import numpy as _np

    magnitudes = (_np.linalg.norm(a), _np.linalg.norm(b))
    if any(magnitude == 0 for magnitude in magnitudes):
        return 0.0

    denominator = magnitudes[0] * magnitudes[1]
    return float(_np.dot(a, b) / denominator)



@router.post("/run-full")
async def run_full(cv: UploadFile = File(...), job_json: str = Form(...)):
    """Minimal endpoint: upload a CV file and a job JSON, return pipeline decision.

    The uploaded CV is saved to a temporary file, text is extracted from it
    (PDF extraction first, plain UTF-8 decoding as a fallback), the existing
    `run_pipeline` function is run on the extracted text, and the top-K
    candidate search for the job text is appended.

    Args:
        cv: Uploaded CV file.
        job_json: JSON-encoded object describing the job; may carry ``top_k``
            (defaults to 5) in addition to the fields `run_pipeline` reads.

    Returns:
        Dict with ``decision`` (the `/run` output) and ``top_k`` (``job_text``,
        ``top_k`` and ``results``).

    Raises:
        HTTPException: 400 when ``job_json`` is not a JSON object or ``top_k``
            is not an integer; 500 when the upload cannot be written to disk.
    """
    try:
        job = json.loads(job_json)
    except (TypeError, ValueError):
        raise HTTPException(status_code=400, detail="'job_json' must be valid JSON")

    if not isinstance(job, dict):
        raise HTTPException(status_code=400, detail="'job_json' must decode to an object")

    # Validate top_k before doing any expensive work so a bad value is a 400
    # rather than an unhandled ValueError after the whole pipeline has run.
    try:
        top_k = int(job.get("top_k", 5))
    except (TypeError, ValueError):
        raise HTTPException(status_code=400, detail="'top_k' must be an integer")

    from uuid import uuid4
    tmp_path = Path('/tmp')
    tmp_path.mkdir(parents=True, exist_ok=True)
    # The client may omit the filename; default to a PDF suffix in that case.
    suffix = Path(cv.filename or "").suffix or '.pdf'
    dest = tmp_path / f"uploaded_cv_{uuid4().hex}{suffix}"
    content = await cv.read()

    extractor = CVExtractionService()
    try:
        try:
            dest.write_bytes(content)
        except Exception as exc:
            raise HTTPException(status_code=500, detail=f"Failed to save uploaded file: {exc}")

        try:
            extraction = extractor.extract_from_pdf(str(dest))
        except Exception:
            # fallback: try text extraction
            text = content.decode('utf-8', errors='ignore')
            extraction = extractor.extract_from_text(text)
    finally:
        # Always remove the temporary upload so files do not pile up in /tmp.
        try:
            dest.unlink()
        except OSError:
            pass

    # NOTE(review): run_pipeline is synchronous and is called directly from
    # this async handler, so it runs on the event loop thread.
    payload = {"candidate": {"raw_text": extraction.raw_text or ""}, "job": job}
    decision_payload = run_pipeline(payload)

    matcher = MatchingService()
    job_text = job.get("job_text") or job.get("description") or ""
    top_k_results = matcher.search_top_k_candidates(job_text=job_text, top_k=top_k)

    return {
        "decision": decision_payload,
        "top_k": {
            "job_text": job_text,
            "top_k": top_k,
            "results": top_k_results,
        },
    }


@router.post("/top-k")
def top_k_candidates(payload: Dict[str, Any]):
    """Return the top-K CV candidates for a job description using FAISS.

    Expected payload:
        {
            "job_text": "...",
            "top_k": 5,
            "index_dir": "models/faiss_index"
        }

    ``description`` is accepted as an alias for ``job_text``; ``top_k``
    defaults to 5 and ``index_dir`` to ``"models/faiss_index"``.

    Returns:
        Dict echoing ``job_text``, ``top_k`` and ``index_dir`` together with
        the search ``results``.

    Raises:
        HTTPException: 400 when ``job_text`` is missing, blank or not a string,
            or when ``top_k`` cannot be converted to an integer.
    """
    job_text = payload.get("job_text") or payload.get("description") or ""
    # Guard the .strip() call below: a non-string value (e.g. a number) would
    # otherwise raise AttributeError and surface as a 500.
    if not isinstance(job_text, str):
        raise HTTPException(status_code=400, detail="'job_text' must be a string")
    if not job_text.strip():
        raise HTTPException(status_code=400, detail="'job_text' is required")

    try:
        top_k = int(payload.get("top_k", 5))
    except (TypeError, ValueError):
        raise HTTPException(status_code=400, detail="'top_k' must be an integer")

    # NOTE(security): index_dir comes straight from the request body and is
    # passed on as a filesystem location. Review whether callers should be
    # allowed to choose arbitrary paths; consider restricting it to a known
    # base directory.
    index_dir = payload.get("index_dir", "models/faiss_index")

    matcher = MatchingService()
    results = matcher.search_top_k_candidates(job_text=job_text, top_k=top_k, index_dir=index_dir)

    return {
        "job_text": job_text,
        "top_k": top_k,
        "index_dir": index_dir,
        "results": results,
    }