"""Matching service: continuous similarity, vector similarity, and deep-learning matcher.

Provides a small, dependency-tolerant wrapper around semantic models and
vector-based fallbacks so the API can offer
`mode=semantic|vector|continuous|deep`.
"""
from __future__ import annotations

import logging
import re
from pathlib import Path
from typing import List, Dict, Any, Optional

import numpy as np

try:
    import joblib
except ImportError:
    # Only the FAISS store needs joblib; keep the rest of the module usable.
    joblib = None

logger = logging.getLogger(__name__)

# Sentence boundary: whitespace that follows '.', '!', '?' or a newline.
_SENTENCE_BOUNDARY = re.compile(r"(?<=[.!?\n])\s+")


def _cosine(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity of *a* and *b*, or 0.0 if either has zero norm."""
    na = np.linalg.norm(a)
    nb = np.linalg.norm(b)
    if na == 0 or nb == 0:
        return 0.0
    return float(np.dot(a, b) / (na * nb))


class MatchingService:
    """High-level matching utilities.

    Attempts to use `sentence_transformers` when available and falls back to
    TF-IDF vector similarity otherwise.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """Create the service and try to load the sentence-transformer model.

        Args:
            model_name: Name passed to ``SentenceTransformer``.

        Any failure while importing or loading the model is logged and leaves
        ``_embedder`` as ``None`` so that `embed` uses its TF-IDF fallback.
        """
        self.model_name = model_name
        self._embedder = None
        self._faiss_index = None
        self._faiss_files: List[str] = []
        # Directory the cached FAISS index was loaded from (None = not loaded).
        self._faiss_index_dir: Optional[str] = None
        try:
            from sentence_transformers import SentenceTransformer

            self._embedder = SentenceTransformer(model_name)
            logger.info("Loaded SentenceTransformer: %s", model_name)
        except Exception:
            # Deliberate best-effort: the service must still work without the model.
            logger.warning("SentenceTransformer unavailable; semantic mode will fall back to TF-IDF.")
            self._embedder = None

    def _load_faiss_store(self, index_dir: str = "models/faiss_index") -> bool:
        """Load a FAISS index built by `backend/scripts/build_bert_faiss.py`.

        Args:
            index_dir: Directory containing ``faiss.index`` and ``mapping.joblib``.

        Returns:
            True when both the index and the file mapping were loaded and
            cached on the instance; False otherwise (missing files, missing
            `joblib`/`faiss`, or a read error).
        """
        index_path = Path(index_dir) / "faiss.index"
        mapping_path = Path(index_dir) / "mapping.joblib"
        if not index_path.exists() or not mapping_path.exists():
            return False

        if joblib is None:
            logger.warning("joblib unavailable; top-k search disabled")
            return False

        try:
            import faiss
        except Exception:
            logger.warning("FAISS unavailable; top-k search disabled")
            return False

        try:
            # NOTE(security): joblib.load unpickles arbitrary objects; only
            # point index_dir at trusted locations.
            mapping = joblib.load(mapping_path)
            files = list(mapping.get("files", [])) if isinstance(mapping, dict) else []
            index = faiss.read_index(str(index_path))
        except Exception as exc:
            logger.warning("Unable to load FAISS store from %s: %s", index_dir, exc)
            self._faiss_index = None
            self._faiss_files = []
            self._faiss_index_dir = None
            return False

        self._faiss_files = files
        self._faiss_index = index
        self._faiss_index_dir = str(index_dir)
        return True

    def embed(self, texts: List[str]) -> np.ndarray:
        """Embed *texts* into one row vector per text.

        Uses the sentence-transformer model when loaded. Otherwise a
        ``TfidfVectorizer`` is fitted on *texts* themselves, so fallback
        vectors are only comparable with other rows of the SAME call.

        Returns:
            A 2-D array with ``len(texts)`` rows; shape ``(0, 0)`` for empty
            input, and an all-zero ``(len(texts), 1)`` array if the TF-IDF
            fallback itself fails.
        """
        if not texts:
            return np.zeros((0, 0))
        if self._embedder is not None:
            return np.asarray(self._embedder.encode(texts, show_progress_bar=False, convert_to_numpy=True))
        # fallback: simple TF-IDF dense vectors via sklearn
        try:
            from sklearn.feature_extraction.text import TfidfVectorizer

            vect = TfidfVectorizer(max_features=4096)
            mat = vect.fit_transform(texts)
            return mat.toarray()
        except Exception as exc:
            # Best-effort: zero vectors make every cosine similarity 0.0.
            logger.debug("TF-IDF fallback failed: %s", exc)
            return np.zeros((len(texts), 1))

    def vector_similarity(self, a: np.ndarray, b: np.ndarray) -> float:
        """Return the cosine similarity of two pre-computed vectors."""
        return _cosine(a, b)

    def semantic_similarity(self, left_text: str, right_text: str) -> float:
        """Return the cosine similarity between the embeddings of two texts."""
        # Both texts are embedded in one call so the TF-IDF fallback shares a vocabulary.
        emb = self.embed([left_text, right_text])
        if emb.shape[0] < 2:
            return 0.0
        return float(_cosine(emb[0], emb[1]))

    def continuous_similarity(self, job_text: str, candidate_text: str, window: int = 64) -> Dict[str, Any]:
        """Score a job description against consecutive chunks of a candidate document.

        The candidate text is split into sentences, grouped into
        non-overlapping chunks of up to *window* sentences, and each chunk is
        compared with the job text by cosine similarity.

        Args:
            job_text: Job description text.
            candidate_text: Candidate document text.
            window: Maximum number of sentences per chunk; must be >= 1.

        Returns:
            A dict with ``"mean"`` and ``"max"`` chunk scores and ``"top_k"``,
            a list of up to three ``(chunk_text, score)`` tuples sorted by
            descending score. All-zero/empty when the candidate has no sentences.

        Raises:
            ValueError: If *window* is smaller than 1 (and the candidate is non-empty).
        """
        # Split candidate into sentences for robustness
        sents = [s.strip() for s in _SENTENCE_BOUNDARY.split(candidate_text) if s.strip()]
        if not sents:
            return {"mean": 0.0, "max": 0.0, "top_k": []}
        if window < 1:
            raise ValueError("window must be a positive integer")

        chunks = [" ".join(sents[i : i + window]) for i in range(0, len(sents), window)]

        # Embed the job and all chunks in ONE call: the TF-IDF fallback fits its
        # vocabulary per call, so separately embedded vectors are not comparable.
        embeddings = self.embed([job_text] + chunks)
        emb_job, emb_chunks = embeddings[0], embeddings[1:]

        scores = [_cosine(emb_job, e) for e in emb_chunks]
        ranked = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
        top_k = [(chunks[idx], float(score)) for idx, score in ranked[:3]]
        return {"mean": float(np.mean(scores)), "max": float(np.max(scores)), "top_k": top_k}

    def deep_match_score(self, job_text: str, candidate_text: str) -> float:
        """Alias for semantic_similarity (keeps API naming clear for 'deep' mode)."""
        return self.semantic_similarity(job_text, candidate_text)

    def search_top_k_candidates(self, job_text: str, top_k: int = 5, index_dir: str = "models/faiss_index") -> List[Dict[str, Any]]:
        """Return the top-K candidate files for a job description using FAISS.

        Args:
            job_text: Job description used as the query.
            top_k: Maximum number of results to return.
            index_dir: Directory holding the FAISS index and file mapping.

        Returns:
            A list of dicts with ``rank``, ``index``, ``score`` and ``file``
            keys (``file`` is None when the mapping has no entry for the hit).
            Empty when the query is blank, ``top_k`` is not positive, or the
            embedder / FAISS / index store is unavailable.
        """
        if not job_text.strip() or top_k <= 0:
            return []
        if self._embedder is None:
            logger.warning("SentenceTransformer unavailable; top-k search not available")
            return []

        # (Re)load when nothing is cached, or when the cached index came from a
        # different directory than the one requested now.
        needs_load = self._faiss_index is None or (
            self._faiss_index_dir is not None and self._faiss_index_dir != str(index_dir)
        )
        if needs_load and not self._load_faiss_store(index_dir=index_dir):
            return []

        try:
            import faiss
        except Exception:
            return []

        # faiss.normalize_L2 works in place and needs a float32, C-contiguous array.
        query_emb = np.ascontiguousarray(
            self._embedder.encode([job_text], show_progress_bar=False, convert_to_numpy=True),
            dtype=np.float32,
        )
        faiss.normalize_L2(query_emb)
        distances, indices = self._faiss_index.search(query_emb, top_k)

        results: List[Dict[str, Any]] = []
        for rank, (idx, score) in enumerate(zip(indices[0], distances[0]), start=1):
            # Negative ids are skipped; presumably FAISS padding for "no neighbour" - confirm.
            if idx < 0:
                continue
            file_path = self._faiss_files[idx] if idx < len(self._faiss_files) else None
            results.append({
                "rank": rank,
                "index": int(idx),
                "score": float(score),
                "file": file_path,
            })
        return results


__all__ = ["MatchingService"]