Spaces:
Sleeping
Sleeping
| """ | |
| Smart Skill Deduplication — Semantic Clustering | |
| Removes duplicate skills using semantic similarity instead of string matching. | |
| Example: ["Python", "python", "Python 3"] → ["Python"] | |
| """ | |
| from typing import List | |
| from collections import defaultdict | |
| import logging | |
| from app.services.normalization import normalize_skill_name | |
| logger = logging.getLogger(__name__) | |
class SmartSkillDeduplicator:
    """Deduplicate skill names, merging near-duplicates by embedding similarity.

    Exact duplicates (ignoring case and surrounding whitespace) are always
    collapsed. When an embedder is supplied, skills whose embeddings are
    similar enough are additionally merged into one canonical name.
    """

    def __init__(self, embedder=None, similarity_threshold: float = 0.82):
        """
        Args:
            embedder: Optional object exposing ``encode(list_of_str)`` that
                returns one vector per string (e.g. a SentenceTransformer).
                When ``None``, only string-level deduplication is done.
            similarity_threshold: Cosine similarity that two skills must
                strictly exceed to be merged (0.0-1.0).
        """
        self.embedder = embedder
        self.similarity_threshold = similarity_threshold

    def deduplicate(self, skills: List[str]) -> List[str]:
        """
        Deduplicate a list of skills via semantic clustering.

        Args:
            skills: e.g. ["Python", "python", "ML", "Machine Learning"]

        Returns:
            One name per group of duplicates. On the semantic path the
            names keep their original (trimmed) spelling, e.g.
            ["Python", "Machine Learning"]; on the string-only path each
            lowercased name is passed through ``normalize_skill_name``.
            An empty input yields ``[]`` and a single-element input is
            returned unchanged (as a copy).
        """
        if not skills:
            return []
        if len(skills) <= 1:
            # Copy so the caller's list is never aliased by the result.
            return list(skills)

        # First pass: exact dedup on the trimmed, lowercased form. The dict
        # keeps insertion order and remembers the first original spelling of
        # every key, so cluster indices computed on the keys stay aligned
        # with the spellings used to pick canonical names.
        originals_by_key = {}
        for raw in skills:
            if not raw:
                continue
            trimmed = raw.strip()
            if trimmed:
                originals_by_key.setdefault(trimmed.lower(), trimmed)

        first_pass = list(originals_by_key)
        if len(first_pass) <= 1 or self.embedder is None:
            return [normalize_skill_name(skill) for skill in first_pass]

        # Second pass: semantic clustering; any failure degrades to the
        # string-only result instead of propagating.
        try:
            clusters = self._cluster_by_similarity(first_pass)
            return self._extract_canonical(
                list(originals_by_key.values()), clusters
            )
        except Exception as e:
            logger.warning("Embedding failed (%s), using string dedup", e)
            return [normalize_skill_name(skill) for skill in first_pass]

    @staticmethod
    def _load_cosine_similarity():
        """Return a pairwise cosine-similarity callable, or None.

        scikit-learn's implementation is used when it can be imported;
        otherwise an equivalent numpy-only version is built. ``None`` means
        neither library is available.
        """
        try:
            from sklearn.metrics.pairwise import cosine_similarity
            return cosine_similarity
        except ImportError:
            pass

        try:
            import numpy as np
        except ImportError:
            return None

        def cosine_similarity(embeddings):
            matrix = np.asarray(embeddings, dtype=float)
            norms = np.linalg.norm(matrix, axis=1, keepdims=True)
            # Leave all-zero rows as zeros instead of dividing by zero.
            norms[norms == 0.0] = 1.0
            unit = matrix / norms
            return unit @ unit.T

        return cosine_similarity

    def _cluster_by_similarity(self, skills: List[str]) -> List[List[int]]:
        """Group skill indices whose embeddings are similar to a seed skill.

        Clustering is greedy and seed-based, not transitive: each skill not
        yet assigned starts a cluster and absorbs every later unassigned
        skill whose similarity to that seed strictly exceeds the threshold.

        Args:
            skills: Skill names to embed and compare.

        Returns:
            Clusters as lists of indices into ``skills``; every index
            appears in exactly one cluster. If no similarity backend can be
            imported, each skill is its own cluster.
        """
        cosine_similarity = self._load_cosine_similarity()
        if cosine_similarity is None:
            logger.warning("sklearn not available, skipping semantic clustering")
            return [[i] for i in range(len(skills))]

        # One embedding per skill, then the full pairwise similarity matrix.
        embeddings = self.embedder.encode(skills)
        similarity_matrix = cosine_similarity(embeddings)

        count = len(skills)
        clusters = []
        assigned = set()
        for seed in range(count):
            if seed in assigned:
                continue
            cluster = [seed]
            assigned.add(seed)
            for other in range(seed + 1, count):
                if other in assigned:
                    continue
                if similarity_matrix[seed][other] > self.similarity_threshold:
                    cluster.append(other)
                    assigned.add(other)
            clusters.append(cluster)
        return clusters

    def _extract_canonical(self, original_skills: List[str],
                           clusters: List[List[int]]) -> List[str]:
        """
        Pick one representative name per cluster.

        Heuristic: the longest name is taken as the most descriptive; on a
        tie the member listed first in the cluster wins.

        Args:
            original_skills: Names indexed by the values in ``clusters``.
            clusters: Lists of indices into ``original_skills``.

        Returns:
            One canonical name per cluster, in cluster order.
        """
        return [
            max((original_skills[i] for i in cluster), key=len)
            for cluster in clusters
        ]