"""Hybrid matcher combining semantic, BERT classifier, skill cosine, and business signals. Weight breakdown (defaults from training config): semantic 0.35 — sentence-transformer cosine similarity on full texts cross_encoder 0.20 — deeper semantic re-ranking (falls back to semantic when unavailable) bert_classifier 0.25 — fine-tuned camembert compatibility classifier skill_cosine 0.12 — binary skill-vector cosine (CosineScorer) business 0.08 — structured rules: experience, location, availability All weights must sum to 1.0; HybridConfig normalizes automatically if they don't. """ from __future__ import annotations import logging from dataclasses import dataclass, field from typing import Dict, List, Optional import numpy as np logger = logging.getLogger(__name__) @dataclass class HybridConfig: weight_semantic: float = 0.35 weight_cross_encoder: float = 0.20 weight_bert_classifier: float = 0.25 weight_skill_cosine: float = 0.12 weight_business: float = 0.08 def __post_init__(self) -> None: total = ( self.weight_semantic + self.weight_cross_encoder + self.weight_bert_classifier + self.weight_skill_cosine + self.weight_business ) if total <= 0: raise ValueError("HybridConfig: all weights are zero.") if abs(total - 1.0) > 1e-6: logger.debug("HybridConfig: weights sum to %.4f — normalizing.", total) self.weight_semantic /= total self.weight_cross_encoder /= total self.weight_bert_classifier /= total self.weight_skill_cosine /= total self.weight_business /= total class HybridMatcher: """Combine multiple matchers into a single weighted score (0–100). Parameters ---------- config: Weight configuration. bert_classifier: Pre-loaded BertClassifierAdapter. If None, the adapter is lazy-loaded from the default model directory (backend/models/bert_matching/). """ def __init__( self, config: Optional[HybridConfig] = None, bert_classifier=None, ) -> None: self.config = config or HybridConfig() self._bert = bert_classifier # may be None; resolved lazily # ------------------------------------------------------------------ # Public API # ------------------------------------------------------------------ def score( self, candidate_text: str, job_text: str, candidate_skills: Optional[List[str]] = None, criteria_skills: Optional[Dict[str, float]] = None, business_signals: Optional[Dict[str, object]] = None, ) -> Dict[str, object]: """Return a hybrid score dict. Parameters ---------- candidate_text: Free-text CV / candidate profile. job_text: Free-text job description / offer. candidate_skills: List of skill names the candidate has. criteria_skills: Dict {skill_name: weight_0_to_100} from recruiter criteria. business_signals: Optional structured signals, e.g.:: { "years_experience": 5, "required_experience": 3, "location_match": True, "available": True, } Returns ------- dict with keys: score (0–100), component_scores, weights_used """ cfg = self.config components: Dict[str, float] = {} # 1. Semantic score components["semantic"] = self._semantic_score(candidate_text, job_text) # 2. Cross-encoder score (fallback to semantic when unavailable) components["cross_encoder"] = self._cross_encoder_score(candidate_text, job_text) # 3. BERT classifier score components["bert_classifier"] = self._bert_score(candidate_text, job_text) # 4. Skill cosine score components["skill_cosine"] = self._skill_cosine_score( candidate_skills or [], criteria_skills or {} ) # 5. Business rules score components["business"] = self._business_score(business_signals or {}) # Weighted sum raw = ( cfg.weight_semantic * components["semantic"] + cfg.weight_cross_encoder * components["cross_encoder"] + cfg.weight_bert_classifier * components["bert_classifier"] + cfg.weight_skill_cosine * components["skill_cosine"] + cfg.weight_business * components["business"] ) final_score = float(np.clip(raw * 100, 0.0, 100.0)) return { "score": final_score, "component_scores": {k: round(v, 4) for k, v in components.items()}, "weights_used": { "semantic": cfg.weight_semantic, "cross_encoder": cfg.weight_cross_encoder, "bert_classifier": cfg.weight_bert_classifier, "skill_cosine": cfg.weight_skill_cosine, "business": cfg.weight_business, }, } # ------------------------------------------------------------------ # Component scorers # ------------------------------------------------------------------ def _semantic_score(self, candidate_text: str, job_text: str) -> float: try: from ai_module.matching.semantic_matcher import SemanticSkillMatcher return SemanticSkillMatcher.semantic_similarity(candidate_text, job_text) except Exception as exc: logger.debug("Semantic scorer unavailable: %s", exc) return 0.0 def _cross_encoder_score(self, candidate_text: str, job_text: str) -> float: """Attempt a cross-encoder pass; fall back to semantic similarity.""" try: from sentence_transformers import CrossEncoder if not hasattr(self, "_cross_encoder_model"): self._cross_encoder_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2") score = self._cross_encoder_model.predict([[candidate_text, job_text]])[0] # ms-marco scores are logits; apply sigmoid import math return float(np.clip(1 / (1 + math.exp(-score)), 0.0, 1.0)) except Exception: # Graceful fallback to standard semantic similarity return self._semantic_score(candidate_text, job_text) def _bert_score(self, candidate_text: str, job_text: str) -> float: bert = self._get_bert() if bert is None: return 0.0 return bert.predict_score(candidate_text, job_text) def _skill_cosine_score( self, candidate_skills: List[str], criteria_skills: Dict[str, float], ) -> float: if not candidate_skills or not criteria_skills: return 0.0 try: from ai_module.matching.scorer import CosineScorer all_skills = list(criteria_skills.keys()) result = CosineScorer.calculate_match_score( candidate_skills, criteria_skills, all_skills ) return float(result["score"]) / 100.0 except Exception as exc: logger.debug("Skill cosine scorer failed: %s", exc) return 0.0 def _business_score(self, signals: Dict[str, object]) -> float: """Simple rules-based business score in [0, 1].""" if not signals: return 0.5 # neutral when no signals provided score = 0.0 count = 0 # Experience years_exp = signals.get("years_experience") required_exp = signals.get("required_experience") if years_exp is not None and required_exp is not None: try: ratio = float(years_exp) / max(float(required_exp), 1.0) score += float(np.clip(ratio, 0.0, 1.0)) except (TypeError, ValueError): score += 0.5 count += 1 # Location match location_match = signals.get("location_match") if location_match is not None: score += 1.0 if location_match else 0.2 count += 1 # Availability available = signals.get("available") if available is not None: score += 1.0 if available else 0.0 count += 1 return float(np.clip(score / max(count, 1), 0.0, 1.0)) # ------------------------------------------------------------------ # Internal helpers # ------------------------------------------------------------------ def _get_bert(self): if self._bert is not None: return self._bert try: from ai_module.matching.bert_classifier_adapter import get_default_adapter self._bert = get_default_adapter() except Exception as exc: logger.warning("Could not load BertClassifierAdapter: %s", exc) self._bert = None return self._bert