File size: 9,105 Bytes
9df97a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
"""Hybrid matcher combining semantic, BERT classifier, skill cosine, and business signals.

Weight breakdown (defaults from training config):
  semantic          0.35  — sentence-transformer cosine similarity on full texts
  cross_encoder     0.20  — deeper semantic re-ranking (falls back to semantic when unavailable)
  bert_classifier   0.25  — fine-tuned camembert compatibility classifier
  skill_cosine      0.12  — binary skill-vector cosine (CosineScorer)
  business          0.08  — structured rules: experience, location, availability

All weights must sum to 1.0; HybridConfig normalizes automatically if they don't.
"""

from __future__ import annotations

import logging
from dataclasses import dataclass, field
from typing import Dict, List, Optional

import numpy as np

logger = logging.getLogger(__name__)


@dataclass
class HybridConfig:
    """Relative weights of the five components of the hybrid score.

    After construction every weight is validated and, when the weights do not
    already sum to 1.0 (within 1e-6), they are rescaled in place so they do.

    Raises
    ------
    ValueError
        If any weight is negative, NaN or infinite, or if all weights are
        zero.
    """

    weight_semantic: float = 0.35  # sentence-transformer similarity on full texts
    weight_cross_encoder: float = 0.20  # cross-encoder re-ranking
    weight_bert_classifier: float = 0.25  # fine-tuned compatibility classifier
    weight_skill_cosine: float = 0.12  # skill-vector cosine
    weight_business: float = 0.08  # structured business rules

    def __post_init__(self) -> None:
        """Validate the weights, then normalize them to sum to 1.0."""
        names = (
            "weight_semantic",
            "weight_cross_encoder",
            "weight_bert_classifier",
            "weight_skill_cosine",
            "weight_business",
        )
        for name in names:
            value = getattr(self, name)
            # The chained comparison is False for NaN, negatives and +inf
            # alike. Without it a negative weight can hide behind a total
            # that still looks valid (e.g. -1 + 2 == 1).
            if not 0.0 <= value < float("inf"):
                raise ValueError(
                    f"HybridConfig: {name} must be a finite, non-negative "
                    f"number (got {value!r})."
                )

        total = sum(getattr(self, name) for name in names)
        if total <= 0:
            raise ValueError("HybridConfig: all weights are zero.")
        if abs(total - 1.0) > 1e-6:
            logger.debug("HybridConfig: weights sum to %.4f — normalizing.", total)
            for name in names:
                setattr(self, name, getattr(self, name) / total)


class HybridMatcher:
    """Combine multiple matchers into a single weighted score (0–100).

    Parameters
    ----------
    config:
        Weight configuration.
    bert_classifier:
        Pre-loaded BertClassifierAdapter. If None, the adapter is lazy-loaded
        from the default model directory (backend/models/bert_matching/).
    """

    def __init__(
        self,
        config: Optional[HybridConfig] = None,
        bert_classifier=None,
    ) -> None:
        self.config = config or HybridConfig()
        self._bert = bert_classifier  # may be None; resolved lazily

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def score(
        self,
        candidate_text: str,
        job_text: str,
        candidate_skills: Optional[List[str]] = None,
        criteria_skills: Optional[Dict[str, float]] = None,
        business_signals: Optional[Dict[str, object]] = None,
    ) -> Dict[str, object]:
        """Return a hybrid score dict.

        Parameters
        ----------
        candidate_text:
            Free-text CV / candidate profile.
        job_text:
            Free-text job description / offer.
        candidate_skills:
            List of skill names the candidate has.
        criteria_skills:
            Dict {skill_name: weight_0_to_100} from recruiter criteria.
        business_signals:
            Optional structured signals, e.g.::

                {
                    "years_experience": 5,
                    "required_experience": 3,
                    "location_match": True,
                    "available": True,
                }

        Returns
        -------
        dict with keys: score (0–100), component_scores, weights_used
        """
        cfg = self.config
        components: Dict[str, float] = {}

        # 1. Semantic score
        components["semantic"] = self._semantic_score(candidate_text, job_text)

        # 2. Cross-encoder score (fallback to semantic when unavailable)
        components["cross_encoder"] = self._cross_encoder_score(candidate_text, job_text)

        # 3. BERT classifier score
        components["bert_classifier"] = self._bert_score(candidate_text, job_text)

        # 4. Skill cosine score
        components["skill_cosine"] = self._skill_cosine_score(
            candidate_skills or [], criteria_skills or {}
        )

        # 5. Business rules score
        components["business"] = self._business_score(business_signals or {})

        # Weighted sum
        raw = (
            cfg.weight_semantic * components["semantic"]
            + cfg.weight_cross_encoder * components["cross_encoder"]
            + cfg.weight_bert_classifier * components["bert_classifier"]
            + cfg.weight_skill_cosine * components["skill_cosine"]
            + cfg.weight_business * components["business"]
        )
        final_score = float(np.clip(raw * 100, 0.0, 100.0))

        return {
            "score": final_score,
            "component_scores": {k: round(v, 4) for k, v in components.items()},
            "weights_used": {
                "semantic": cfg.weight_semantic,
                "cross_encoder": cfg.weight_cross_encoder,
                "bert_classifier": cfg.weight_bert_classifier,
                "skill_cosine": cfg.weight_skill_cosine,
                "business": cfg.weight_business,
            },
        }

    # ------------------------------------------------------------------
    # Component scorers
    # ------------------------------------------------------------------

    def _semantic_score(self, candidate_text: str, job_text: str) -> float:
        """Score the two texts with ``SemanticSkillMatcher.semantic_similarity``.

        The matcher is imported on demand. If the import or the call raises,
        the failure is logged at debug level and 0.0 is returned instead.
        """
        try:
            from ai_module.matching.semantic_matcher import (
                SemanticSkillMatcher as matcher,
            )

            similarity = matcher.semantic_similarity(candidate_text, job_text)
        except Exception as exc:
            logger.debug("Semantic scorer unavailable: %s", exc)
            similarity = 0.0
        return similarity

    def _cross_encoder_score(self, candidate_text: str, job_text: str) -> float:
        """Attempt a cross-encoder pass; fall back to semantic similarity."""
        try:
            from sentence_transformers import CrossEncoder

            if not hasattr(self, "_cross_encoder_model"):
                self._cross_encoder_model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
            score = self._cross_encoder_model.predict([[candidate_text, job_text]])[0]
            # ms-marco scores are logits; apply sigmoid
            import math

            return float(np.clip(1 / (1 + math.exp(-score)), 0.0, 1.0))
        except Exception:
            # Graceful fallback to standard semantic similarity
            return self._semantic_score(candidate_text, job_text)

    def _bert_score(self, candidate_text: str, job_text: str) -> float:
        bert = self._get_bert()
        if bert is None:
            return 0.0
        return bert.predict_score(candidate_text, job_text)

    def _skill_cosine_score(
        self,
        candidate_skills: List[str],
        criteria_skills: Dict[str, float],
    ) -> float:
        if not candidate_skills or not criteria_skills:
            return 0.0
        try:
            from ai_module.matching.scorer import CosineScorer

            all_skills = list(criteria_skills.keys())
            result = CosineScorer.calculate_match_score(
                candidate_skills, criteria_skills, all_skills
            )
            return float(result["score"]) / 100.0
        except Exception as exc:
            logger.debug("Skill cosine scorer failed: %s", exc)
            return 0.0

    def _business_score(self, signals: Dict[str, object]) -> float:
        """Simple rules-based business score in [0, 1]."""
        if not signals:
            return 0.5  # neutral when no signals provided

        score = 0.0
        count = 0

        # Experience
        years_exp = signals.get("years_experience")
        required_exp = signals.get("required_experience")
        if years_exp is not None and required_exp is not None:
            try:
                ratio = float(years_exp) / max(float(required_exp), 1.0)
                score += float(np.clip(ratio, 0.0, 1.0))
            except (TypeError, ValueError):
                score += 0.5
            count += 1

        # Location match
        location_match = signals.get("location_match")
        if location_match is not None:
            score += 1.0 if location_match else 0.2
            count += 1

        # Availability
        available = signals.get("available")
        if available is not None:
            score += 1.0 if available else 0.0
            count += 1

        return float(np.clip(score / max(count, 1), 0.0, 1.0))

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _get_bert(self):
        if self._bert is not None:
            return self._bert
        try:
            from ai_module.matching.bert_classifier_adapter import get_default_adapter

            self._bert = get_default_adapter()
        except Exception as exc:
            logger.warning("Could not load BertClassifierAdapter: %s", exc)
            self._bert = None
        return self._bert