Spaces:

RHmaster
/

ai-talent-finder-backend

Running

ai-talent-finder-backend/ai_module/matching/semantic_matcher.py

ilyass yani

Deploiement backend dans HF Spaces

9df97a2 10 days ago

8.4 kB

	"""Semantic skill matching using modern open-source embeddings + optional FAISS.

	Default model:
	- BAAI/bge-small-en

	Key capabilities:
	- text embedding cache
	- cosine similarity
	- optional FAISS inner-product index for fast nearest-neighbor lookup
	"""

	from __future__ import annotations

	import os
	from typing import Dict, List, Optional, Tuple

	import numpy as np

	# Optional dependency: sentence-transformers supplies the embedding model.
	# Only the import itself is guarded; the flag is set in the else/except
	# branches so nothing else can be mistaken for a missing package.
	try:
	    from sentence_transformers import SentenceTransformer
	except ImportError:
	    SENTENCE_TRANSFORMERS_AVAILABLE = False
	    print("Warning: sentence-transformers not installed. Run: pip install sentence-transformers")
	else:
	    SENTENCE_TRANSFORMERS_AVAILABLE = True

	# Optional dependency: FAISS is only used to speed up nearest-neighbor
	# search; when it is missing the flag is False and no warning is printed.
	try:
	    import faiss  # type: ignore
	except ImportError:
	    FAISS_AVAILABLE = False
	else:
	    FAISS_AVAILABLE = True


	class SemanticSkillMatcher:
	    """Match candidate skills with weighted criteria using semantic embeddings.

	    All state lives on the class: the sentence-transformers model is loaded
	    lazily on first use and shared, and every embedding is cached under the
	    stripped, lower-cased form of its text.
	    """

	    # Both settings are read from the environment once, when the class body runs.
	    MODEL_NAME = os.getenv("SEMANTIC_EMBEDDING_MODEL", "BAAI/bge-small-en")
	    DEFAULT_THRESHOLD = float(os.getenv("SEMANTIC_MATCH_THRESHOLD", "0.60"))

	    # Shared model instance; stays None until a load succeeds.
	    _model = None
	    # Maps text.strip().lower() -> L2-normalized float32 embedding.
	    # The cache is unbounded; call clear_cache() to release it.
	    _embedding_cache: Dict[str, np.ndarray] = {}

	    @classmethod
	    def _load_model(cls) -> Optional["SentenceTransformer"]:
	        """Load and cache the sentence-transformers model once.

	        Returns:
	            The shared model, or None when sentence-transformers is not
	            installed or constructing the model raised. A failed load is not
	            remembered, so the next call tries again.
	        """
	        if cls._model is not None:
	            return cls._model

	        if not SENTENCE_TRANSFORMERS_AVAILABLE:
	            print("sentence-transformers not available")
	            return None

	        try:
	            print(f"Loading {cls.MODEL_NAME}...")
	            cls._model = SentenceTransformer(cls.MODEL_NAME)
	            print(f"✓ Model loaded successfully. Embedding dimension: {cls._model.get_sentence_embedding_dimension()}")
	            return cls._model
	        except Exception as e:
	            print(f"Error loading model: {e}")
	            return None

	    @staticmethod
	    def _normalize(vecs: np.ndarray) -> np.ndarray:
	        """L2-normalize each row so a dot product equals cosine similarity.

	        Args:
	            vecs: 2-D array with one vector per row.

	        Returns:
	            Array of the same shape; all-zero rows are returned unchanged.
	        """
	        norms = np.linalg.norm(vecs, axis=1, keepdims=True)
	        # Divide zero vectors by 1 instead of 0 to avoid NaNs.
	        norms = np.where(norms == 0, 1.0, norms)
	        return vecs / norms

	    @classmethod
	    def get_embedding(cls, text: str) -> Optional[np.ndarray]:
	        """Return the normalized embedding of one text, caching the result.

	        Args:
	            text: Text to embed. The cache key is ``text.strip().lower()``;
	                the model is given the text as passed.

	        Returns:
	            1-D float32 vector, or None if the model is unavailable or
	            encoding raised.
	        """
	        key = text.strip().lower()
	        cached = cls._embedding_cache.get(key)
	        if cached is not None:
	            return cached

	        model = cls._load_model()
	        if model is None:
	            return None

	        try:
	            embedding = model.encode([text], convert_to_numpy=True).astype(np.float32)
	            embedding = cls._normalize(embedding)[0]
	            cls._embedding_cache[key] = embedding
	            return embedding
	        except Exception as e:
	            print(f"Error embedding text '{text}': {e}")
	            return None

	    @classmethod
	    def get_embeddings_batch(cls, texts: List[str]) -> Optional[np.ndarray]:
	        """Return normalized embeddings for several texts, reusing the cache.

	        Only texts whose key (``text.strip().lower()``) is not cached yet are
	        sent to the model, in a single encode call; texts that share a key
	        share one embedding, exactly as with get_embedding().

	        Args:
	            texts: Texts to embed.

	        Returns:
	            float32 array with one row per input text, in input order, or
	            None when ``texts`` is empty, the model is needed but
	            unavailable, or encoding raised.
	        """
	        if not texts:
	            return None

	        keys = [text.strip().lower() for text in texts]

	        # First occurrence of every uncached key, in input order.
	        missing: Dict[str, str] = {}
	        for key, text in zip(keys, texts):
	            if key not in cls._embedding_cache and key not in missing:
	                missing[key] = text

	        if missing:
	            model = cls._load_model()
	            if model is None:
	                return None

	            try:
	                fresh = model.encode(list(missing.values()), convert_to_numpy=True).astype(np.float32)
	                fresh = cls._normalize(fresh)
	            except Exception as e:
	                print(f"Error getting batch embeddings: {e}")
	                return None

	            for key, embedding in zip(missing, fresh):
	                cls._embedding_cache[key] = embedding

	        return np.stack([cls._embedding_cache[key] for key in keys]).astype(np.float32)

	    @classmethod
	    def semantic_similarity(cls, text1: str, text2: str) -> float:
	        """Cosine similarity between two texts, clipped to [0, 1].

	        Returns 0.0 when either text could not be embedded.
	        """
	        embed1 = cls.get_embedding(text1)
	        embed2 = cls.get_embedding(text2)

	        if embed1 is None or embed2 is None:
	            return 0.0

	        # Embeddings are unit length, so the dot product is the cosine.
	        similarity = float(np.dot(embed1, embed2))
	        return float(np.clip(similarity, 0.0, 1.0))

	    @classmethod
	    def build_faiss_index(cls, corpus: List[str]) -> Optional[Tuple["faiss.IndexFlatIP", List[str]]]:
	        """Build a FAISS inner-product index for a corpus of phrases.

	        Args:
	            corpus: Phrases to index; empty and whitespace-only entries are
	                dropped and the rest are stripped.

	        Returns:
	            ``(index, cleaned)`` where row ``i`` of the index corresponds to
	            ``cleaned[i]``, or None when FAISS is not installed, nothing is
	            left after cleaning, or the embeddings could not be computed.
	        """
	        if not FAISS_AVAILABLE:
	            return None

	        cleaned = [item.strip() for item in corpus if item and item.strip()]
	        if not cleaned:
	            return None

	        embeddings = cls.get_embeddings_batch(cleaned)
	        if embeddings is None:
	            return None

	        index = faiss.IndexFlatIP(embeddings.shape[1])
	        index.add(embeddings.astype(np.float32))
	        return index, cleaned

	    @classmethod
	    def search_similar(cls, query: str, corpus: List[str], top_k: int = 5) -> List[Tuple[str, float]]:
	        """Return the top-k corpus entries most similar to the query.

	        The corpus is cleaned (blank entries dropped, the rest stripped)
	        before either search path runs, so the result does not depend on
	        whether FAISS is installed.

	        Args:
	            query: Text to search for.
	            corpus: Candidate phrases.
	            top_k: Number of results wanted; clamped to
	                ``[1, number of non-blank corpus entries]``.

	        Returns:
	            ``(entry, similarity)`` pairs, best first, with similarity
	            clipped to [0, 1]. Empty when the corpus has no usable entry or
	            the query could not be embedded.
	        """
	        cleaned = [item.strip() for item in (corpus or []) if item and item.strip()]
	        if not cleaned:
	            return []

	        top_k = max(1, min(top_k, len(cleaned)))
	        q = cls.get_embedding(query)
	        if q is None:
	            return []

	        # Use FAISS when available.
	        index_bundle = cls.build_faiss_index(cleaned)
	        if index_bundle is not None:
	            index, indexed = index_bundle
	            scores, idxs = index.search(np.expand_dims(q.astype(np.float32), axis=0), top_k)
	            return [
	                (indexed[int(i)], float(np.clip(scores[0][rank], 0.0, 1.0)))
	                for rank, i in enumerate(idxs[0])
	                if int(i) >= 0  # skip negative indices (no result in that slot)
	            ]

	        # Fallback brute force.
	        similarities = [(item, cls.semantic_similarity(query, item)) for item in cleaned]
	        similarities.sort(key=lambda pair: pair[1], reverse=True)
	        return similarities[:top_k]

	    @classmethod
	    def match_candidate_skills(
	        cls,
	        candidate_skills: List[str],
	        criteria_skills: List[Dict[str, object]],
	        threshold: float = DEFAULT_THRESHOLD,
	    ) -> Dict[str, object]:
	        """Match candidate skills to weighted criteria with semantic nearest-neighbor.

	        For every criterion with a non-empty ``name`` the closest candidate
	        skill is looked up; the criterion counts as matched when that
	        similarity is at least ``threshold``.

	        Args:
	            candidate_skills: Skills of the candidate.
	            criteria_skills: Dicts with a ``name`` and an optional ``weight``.
	                A missing or falsy weight (None, 0, "") is treated as 50.
	            threshold: Minimum similarity for a match.

	        Returns:
	            Dict with ``matched_skills``, ``score`` (matched weight as a
	            percentage of total weight, 0-100), ``total_matches``,
	            ``total_criteria`` and ``details``. Every return path carries all
	            five keys.
	        """
	        total_criteria = len(criteria_skills or [])

	        if not candidate_skills or not criteria_skills:
	            return {
	                "matched_skills": [],
	                "score": 0.0,
	                "total_matches": 0,
	                "total_criteria": total_criteria,
	                "details": "No skills to match",
	            }

	        candidate_skills_clean = [s.strip() for s in candidate_skills if s and s.strip()]
	        if not candidate_skills_clean:
	            return {
	                "matched_skills": [],
	                "score": 0.0,
	                "total_matches": 0,
	                "total_criteria": total_criteria,
	                "details": "No candidate skills available",
	            }

	        matched_skills: List[Dict[str, object]] = []
	        total_weight = 0
	        total_matched_weight = 0

	        for criteria in criteria_skills:
	            criteria_name = str(criteria.get("name", "")).strip()
	            criteria_weight = int(criteria.get("weight", 50) or 50)

	            # Unnamed criteria are ignored and add nothing to the total weight.
	            if not criteria_name:
	                continue

	            total_weight += criteria_weight

	            nearest = cls.search_similar(criteria_name, candidate_skills_clean, top_k=1)
	            if not nearest:
	                continue

	            best_match, best_similarity = nearest[0]
	            if best_similarity >= threshold:
	                total_matched_weight += criteria_weight
	                matched_skills.append({
	                    "criteria_skill": criteria_name.lower(),
	                    "matched_skill": best_match,
	                    "similarity": float(best_similarity),
	                    "weight": criteria_weight,
	                })

	        overall_score = (total_matched_weight / total_weight * 100) if total_weight > 0 else 0.0

	        return {
	            "matched_skills": matched_skills,
	            "score": float(np.clip(overall_score, 0.0, 100.0)),
	            "total_matches": len(matched_skills),
	            "total_criteria": total_criteria,
	            "details": f"Matched {len(matched_skills)}/{len(criteria_skills)} criteria skills",
	        }

	    @classmethod
	    def clear_cache(cls):
	        """Clear embedding cache."""
	        cls._embedding_cache.clear()

	    @classmethod
	    def get_cache_size(cls) -> int:
	        """Return number of cached embeddings."""
	        return len(cls._embedding_cache)


	# Utility function for simple similarity check
	def semantic_skill_match(skill1: str, skill2: str, threshold: float = 0.6) -> Tuple[bool, float]:
	    """Compare two skills semantically.

	    Returns:
	        ``(is_match, similarity)`` where ``similarity`` comes from
	        ``SemanticSkillMatcher.semantic_similarity`` and ``is_match`` is True
	        when it is at least ``threshold``.
	    """
	    score = SemanticSkillMatcher.semantic_similarity(skill1, skill2)
	    return score >= threshold, score