File size: 5,918 Bytes
9df97a2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 | """
Skill extraction from CV text
Main component of NLP pipeline
"""
import re
import json
from pathlib import Path
from typing import List, Dict, Tuple
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
class SkillExtractor:
"""
Extract skills from CV text using:
1. Dictionary matching (exact + fuzzy)
2. Regex patterns for known technologies
3. Context-aware extraction
"""
def __init__(self, skills_dict_path: str = None):
"""
Initialize skill extractor with skills dictionary
Args:
skills_dict_path: Path to skills_dictionary.json
"""
if skills_dict_path is None:
# Default path relative to this file
skills_dict_path = str(Path(__file__).parent.parent / "data" / "skills_dictionary.json")
self.skills_dict = self._load_skills_dictionary(skills_dict_path)
self.all_skills = self._flatten_skills_dict()
self.skill_categories = self._build_category_map()
def _load_skills_dictionary(self, path: str) -> Dict:
"""Load skills dictionary from JSON file"""
try:
with open(path, 'r', encoding='utf-8') as f:
return json.load(f)
except FileNotFoundError:
print(f"Warning: Skills dictionary not found at {path}")
return {"tech": [], "soft": [], "language": []}
def _flatten_skills_dict(self) -> List[str]:
"""
Flatten the skills dictionary into a single list of all skills
"""
all_skills = []
for category, skills in self.skills_dict.items():
if isinstance(skills, list):
all_skills.extend(skills)
return all_skills
def _build_category_map(self) -> Dict[str, str]:
"""
Build a mapping of skill name -> category for quick lookups
"""
category_map = {}
for category, skills in self.skills_dict.items():
if isinstance(skills, list):
for skill in skills:
category_map[skill.lower()] = category
return category_map
def extract_skills(self, text: str, threshold: int = 80) -> List[Dict]:
"""
Extract skills from text
Args:
text: CV text content
threshold: Fuzzy matching threshold (0-100)
Returns:
List of extracted skills with category and method
[{"name": "Python", "category": "tech", "method": "exact", "confidence": 100}, ...]
"""
if not text:
return []
# Convert to lowercase for matching
text_lower = text.lower()
extracted = []
seen = set() # To avoid duplicates
# 1. Exact matching
for skill in self.all_skills:
skill_lower = skill.lower()
# Check for exact word match (with word boundaries)
pattern = r'\b' + re.escape(skill_lower) + r'\b'
if re.search(pattern, text_lower) and skill_lower not in seen:
extracted.append({
"name": skill,
"category": self.skill_categories.get(skill.lower(), "unknown"),
"method": "exact",
"confidence": 100
})
seen.add(skill_lower)
# 2. Fuzzy matching for variations
words = re.findall(r'\b[\w\-]+\b', text_lower)
unique_words = list(set(words))
for word in unique_words:
if word not in seen and len(word) > 2:
# Find best match in skills dictionary
matches = process.extract(word, self.all_skills, limit=1, scorer=fuzz.token_sort_ratio)
if matches and matches[0][1] >= threshold:
best_match = matches[0][0]
if best_match.lower() not in seen:
extracted.append({
"name": best_match,
"category": self.skill_categories.get(best_match.lower(), "unknown"),
"method": "fuzzy",
"confidence": matches[0][1]
})
seen.add(best_match.lower())
return extracted
def extract_proficiency(self, text: str, skill: str) -> str:
"""
Estimate proficiency level for a skill based on context
Args:
text: CV text
skill: Skill name to assess
Returns:
"expert", "advanced", "intermediate", or "beginner"
"""
text_lower = text.lower()
skill_lower = skill.lower()
# Find context around skill mention (±100 chars)
pattern = r'.{0,100}\b' + re.escape(skill_lower) + r'\b.{0,100}'
matches = re.findall(pattern, text_lower, re.IGNORECASE)
if not matches:
return "beginner"
context = " ".join(matches)
# Keywords for proficiency levels
expert_keywords = ["expert", "lead", "architect", "senior", "principal", "master", "specialized"]
advanced_keywords = ["advanced", "proficient", "strong", "deep", "extensive"]
intermediate_keywords = ["familiar", "experience", "worked", "used", "knowledge"]
# Count keyword matches
context_lower = context.lower()
for keyword in expert_keywords:
if keyword in context_lower:
return "expert"
for keyword in advanced_keywords:
if keyword in context_lower:
return "advanced"
for keyword in intermediate_keywords:
if keyword in context_lower:
return "intermediate"
return "beginner"
|