File size: 5,918 Bytes
9df97a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
"""
Skill extraction from CV text
Main component of NLP pipeline
"""

import re
import json
from pathlib import Path
from typing import List, Dict, Tuple
from fuzzywuzzy import fuzz
from fuzzywuzzy import process


class SkillExtractor:
    """
    Extract skills from CV text using a skills dictionary.

    Techniques:
    1. Exact dictionary matching (case-insensitive, boundary-aware regex)
    2. Fuzzy matching of individual words against the dictionary
    3. Keyword-based proficiency estimation from the text around a skill

    Attributes:
        skills_dict: Raw dictionary as loaded from JSON (category -> skills)
        all_skills: Every skill from all list-valued categories
        skill_categories: Lowercased skill name -> category
    """

    def __init__(self, skills_dict_path: str = None):
        """
        Initialize skill extractor with skills dictionary

        Args:
            skills_dict_path: Path to skills_dictionary.json. When None, the
                file is looked up at ../data/skills_dictionary.json relative
                to the directory containing this module.
        """
        if skills_dict_path is None:
            # Default path relative to this file
            skills_dict_path = str(Path(__file__).parent.parent / "data" / "skills_dictionary.json")

        self.skills_dict = self._load_skills_dictionary(skills_dict_path)
        self.all_skills = self._flatten_skills_dict()
        self.skill_categories = self._build_category_map()

    def _load_skills_dictionary(self, path: str) -> Dict:
        """
        Load skills dictionary from JSON file

        Args:
            path: Location of the JSON file

        Returns:
            The parsed JSON content. If the file does not exist, a warning is
            printed and a dictionary with empty "tech", "soft" and "language"
            lists is returned so the extractor stays usable.

        Note:
            Only a missing file is tolerated; malformed JSON still raises.
        """
        try:
            with open(path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            print(f"Warning: Skills dictionary not found at {path}")
            return {"tech": [], "soft": [], "language": []}

    def _flatten_skills_dict(self) -> List[str]:
        """
        Flatten the skills dictionary into a single list of all skills

        Categories whose value is not a list are ignored.
        """
        return [
            skill
            for skills in self.skills_dict.values()
            if isinstance(skills, list)
            for skill in skills
        ]

    def _build_category_map(self) -> Dict[str, str]:
        """
        Build a mapping of lowercased skill name -> category for quick lookups

        Categories whose value is not a list are ignored. If a skill appears
        in several categories, the last category encountered wins.
        """
        return {
            skill.lower(): category
            for category, skills in self.skills_dict.items()
            if isinstance(skills, list)
            for skill in skills
        }

    @staticmethod
    def _skill_pattern(skill_lower: str) -> str:
        """
        Build a regex matching a (non-empty, lowercased) skill as a whole token

        A plain r'\\b' on both sides cannot match skills that start or end
        with a non-word character ("c++", "c#", ".net") in ordinary text,
        because r'\\b' needs a word character on exactly one side. The word
        boundary is therefore only required on an edge where the skill itself
        has a word character.
        """
        prefix = r'\b' if re.match(r'\w', skill_lower[0]) else ''
        suffix = r'\b' if re.match(r'\w', skill_lower[-1]) else ''
        return prefix + re.escape(skill_lower) + suffix

    def extract_skills(self, text: str, threshold: int = 80) -> List[Dict]:
        """
        Extract skills from text

        Args:
            text: CV text content
            threshold: Fuzzy matching threshold (0-100)

        Returns:
            List of extracted skills with category and method; exact matches
            come first (dictionary order), then fuzzy matches (in order of
            first appearance of the matching word in the text)
            [{"name": "Python", "category": "tech", "method": "exact", "confidence": 100}, ...]
        """
        if not text:
            return []

        # Convert to lowercase for matching
        text_lower = text.lower()
        extracted = []
        seen = set()  # Lowercased skill names already reported

        # 1. Exact matching
        for skill in self.all_skills:
            skill_lower = skill.lower()

            # Skip blanks and duplicates before paying for a regex search
            if not skill_lower or skill_lower in seen:
                continue

            if re.search(self._skill_pattern(skill_lower), text_lower):
                extracted.append({
                    "name": skill,
                    "category": self.skill_categories.get(skill_lower, "unknown"),
                    "method": "exact",
                    "confidence": 100
                })
                seen.add(skill_lower)

        # Nothing to fuzzy-match against (e.g. dictionary file was missing)
        if not self.all_skills:
            return extracted

        # 2. Fuzzy matching for variations
        words = re.findall(r'\b[\w\-]+\b', text_lower)

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # result order is reproducible between runs (a set is not ordered)
        for word in dict.fromkeys(words):
            if word in seen or len(word) <= 2:
                continue

            # Find best match in skills dictionary
            matches = process.extract(word, self.all_skills, limit=1, scorer=fuzz.token_sort_ratio)
            if not matches or matches[0][1] < threshold:
                continue

            best_match = matches[0][0]
            best_lower = best_match.lower()
            if best_lower in seen:
                continue

            extracted.append({
                "name": best_match,
                "category": self.skill_categories.get(best_lower, "unknown"),
                "method": "fuzzy",
                "confidence": matches[0][1]
            })
            seen.add(best_lower)

        return extracted

    def extract_proficiency(self, text: str, skill: str) -> str:
        """
        Estimate proficiency level for a skill based on context

        The text surrounding each mention of the skill is scanned for level
        keywords; the highest level with at least one keyword wins. Keywords
        are matched as substrings (e.g. "lead" also matches "leading").

        Args:
            text: CV text
            skill: Skill name to assess

        Returns:
            "expert", "advanced", "intermediate", or "beginner". "beginner"
            is also returned when text or skill is empty/None, when the skill
            is not mentioned, or when no keyword is found near it.
        """
        if not text or not skill:
            return "beginner"

        text_lower = text.lower()
        skill_lower = skill.lower()

        # Find context around skill mention (up to 100 chars on each side).
        # '.' does not match newlines, so the window stays on the same line.
        pattern = r'.{0,100}' + self._skill_pattern(skill_lower) + r'.{0,100}'
        matches = re.findall(pattern, text_lower)

        if not matches:
            return "beginner"

        context = " ".join(matches)

        # Keywords for proficiency levels, checked from highest to lowest
        levels = (
            ("expert", ["expert", "lead", "architect", "senior", "principal", "master", "specialized"]),
            ("advanced", ["advanced", "proficient", "strong", "deep", "extensive"]),
            ("intermediate", ["familiar", "experience", "worked", "used", "knowledge"]),
        )

        for level, keywords in levels:
            if any(keyword in context for keyword in keywords):
                return level

        return "beginner"