# Spaces:
# Sleeping
# Sleeping
# File size: 4,137 Bytes
# 9df97a2
"""
Smart Skill Deduplication — Semantic Clustering
Removes duplicate skills using exact (case-insensitive) string matching first, then semantic similarity when an embedder is available.
Example: ["Python", "python", "Python 3"] → ["Python"]
"""
from typing import List
from collections import defaultdict
import logging
from app.services.normalization import normalize_skill_name
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class SmartSkillDeduplicator:
    """Deduplicates skills by exact match first, then by semantic clustering.

    The exact-match pass is case-insensitive and ignores surrounding
    whitespace. The semantic pass only runs when an embedder was supplied;
    it merges skills whose embedding cosine similarity exceeds
    ``similarity_threshold`` and keeps the longest spelling of each cluster.
    """

    def __init__(self, embedder=None, similarity_threshold: float = 0.82):
        """
        Args:
            embedder: Object exposing ``encode(list_of_str)`` that returns one
                embedding row per string, e.g. a SentenceTransformer
                instance (optional).
            similarity_threshold: Cosine similarity a pair of skills must
                strictly exceed to be merged (0.0-1.0).
        """
        self.embedder = embedder
        self.similarity_threshold = similarity_threshold

    def deduplicate(self, skills: List[str]) -> List[str]:
        """
        Deduplicate a list of skills via semantic clustering.

        Args:
            skills: ["Python", "python", "ML", "Machine Learning"]

        Returns:
            With an embedder: one original (trimmed) spelling per cluster,
            e.g. ["Python", "Machine Learning"] if the embedder scores
            "ml" and "machine learning" above the threshold.
            Without an embedder, or if embedding fails: the exact-match
            deduplicated keys passed through ``normalize_skill_name``.
            An empty input gives ``[]``; a single-element input is returned
            unchanged.
        """
        if not skills:
            return []
        if len(skills) <= 1:
            # Nothing to compare against; the list is returned unchanged.
            return skills

        # First pass: exact deduplication on the lowercased, trimmed key.
        # Remember the first-seen trimmed spelling of every key so that the
        # cluster indices (which refer to positions in ``first_pass``) can be
        # mapped back to a human-readable spelling later.
        spelling_by_key = {}
        for raw in skills:
            trimmed = raw.strip() if raw else ""
            if trimmed:
                spelling_by_key.setdefault(trimmed.lower(), trimmed)

        # dicts preserve insertion order, so keys and values stay aligned.
        first_pass = list(spelling_by_key)
        if len(first_pass) <= 1:
            return [normalize_skill_name(skill) for skill in first_pass]

        # Second pass: semantic clustering (if embedder available).
        if self.embedder is not None:
            try:
                clusters = self._cluster_by_similarity(first_pass)
                # Index-aligned with ``first_pass``; indexing the raw input
                # here would be wrong as soon as a duplicate or blank entry
                # was dropped by the first pass.
                spellings = list(spelling_by_key.values())
                return self._extract_canonical(spellings, clusters)
            except Exception as e:
                # Best effort: any embedding/clustering failure falls back to
                # the exact-match result below.
                logger.warning(f"Embedding failed ({e}), using string dedup")

        return [normalize_skill_name(skill) for skill in first_pass]

    def _cluster_by_similarity(self, skills: List[str]) -> List[List[int]]:
        """Cluster skills by semantic similarity.

        Args:
            skills: Strings to embed with ``self.embedder.encode``.

        Returns:
            A list of clusters, each a list of indices into ``skills``.
            Every index appears in exactly one cluster. If scikit-learn is
            not importable, every skill gets its own cluster.
        """
        try:
            from sklearn.metrics.pairwise import cosine_similarity
        except ImportError:
            logger.warning("sklearn not available, skipping semantic clustering")
            return [[i] for i in range(len(skills))]

        # Generate embeddings (one row per skill) and compare every pair.
        embeddings = self.embedder.encode(skills)
        similarity_matrix = cosine_similarity(embeddings)

        # Greedy seed clustering: the first unassigned skill becomes a seed
        # and absorbs every later unassigned skill similar to the seed.
        # Membership is NOT transitive: a skill that is similar to a member
        # but not to the seed itself is not pulled into the cluster.
        clusters: List[List[int]] = []
        assigned = set()
        total = len(skills)
        for seed in range(total):
            if seed in assigned:
                continue
            cluster = [seed]
            assigned.add(seed)
            for other in range(seed + 1, total):
                if other in assigned:
                    continue
                if similarity_matrix[seed][other] > self.similarity_threshold:
                    cluster.append(other)
                    assigned.add(other)
            clusters.append(cluster)
        return clusters

    def _extract_canonical(self, original_skills: List[str],
                           clusters: List[List[int]]) -> List[str]:
        """
        Extract canonical skill for each cluster.

        Heuristic: longest skill = most descriptive. On a length tie the
        earliest member of the cluster wins.

        Args:
            original_skills: Spellings that the indices in ``clusters``
                refer to.
            clusters: Lists of indices into ``original_skills``.

        Returns:
            One spelling per cluster, in cluster order.
        """
        return [
            max((original_skills[i] for i in cluster), key=len)
            for cluster in clusters
        ]
|