File size: 9,789 Bytes
9df97a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
"""
Enhanced Skill Extractor - Combines NER + Dictionary Fuzzy Matching
Étape 6 Optimization: Hybrid approach for maximum skill coverage
"""

from typing import List, Dict, Optional
import json
import os
import re

try:
    from transformers import pipeline
    NER_AVAILABLE = True
except ImportError:
    NER_AVAILABLE = False

from ai_module.nlp.skill_extractor import SkillExtractor
from ai_module.matching.semantic_matcher import SemanticSkillMatcher

try:
    from ai_module.nlp.multilingual_skill_extractor import MultilingualSkillExtractor
    MULTILINGUAL_EXTRACTOR_AVAILABLE = True
except Exception:
    MultilingualSkillExtractor = None
    MULTILINGUAL_EXTRACTOR_AVAILABLE = False


class EnhancedSkillExtractor:
    """
    Extract skills using hybrid approach:
    1. BERT-based NER (95% accuracy)
    2. Dictionary fuzzy matching (80% accuracy)
    3. Intelligent merge with confidence scores
    """
    
    def __init__(self, load_ner: bool = True):
        """Initialize both extraction methods"""
        self.skill_extractor = SkillExtractor()
        self.canonical_skills = list(self.skill_extractor.all_skills)
        self.semantic_threshold = float(os.getenv("SKILL_NORMALIZATION_THRESHOLD", "0.62"))
        self.ner_model_name = os.getenv("HF_SKILL_NER_MODEL", "dslim/bert-base-NER")
        self.multilingual_extractor = MultilingualSkillExtractor() if MULTILINGUAL_EXTRACTOR_AVAILABLE else None
        
        if load_ner and NER_AVAILABLE:
            try:
                self.ner_pipeline = pipeline(
                    "ner",
                    model=self.ner_model_name,
                    aggregation_strategy="simple"
                )
                self.ner_available = True
            except Exception as e:
                print(f"⚠️ NER model not available: {e}. Using dictionary-only extraction.")
                self.ner_available = False
        else:
            self.ner_available = False
    
    def extract_skills_hybrid(self, text: str, threshold: int = 80) -> List[Dict]:
        """
        Extract skills using both NER and dictionary methods
        
        Args:
            text: CV text
            threshold: Fuzzy matching threshold
        
        Returns:
            List of skills with source and confidence
            [
                {"name": "Python", "source": "NER", "confidence": 0.95},
                {"name": "React", "source": "DICT-FUZZY", "confidence": 0.80}
            ]
        """
        all_skills = []
        seen_skills = set()  # Track added skills (lowercase)
        
        # Step 1: Extract via NER (if available)
        if self.ner_available:
            ner_skills = self._extract_via_ner(text)
            for skill_data in ner_skills:
                skill_name_lower = skill_data["name"].lower()
                if skill_name_lower not in seen_skills:
                    all_skills.append(skill_data)
                    seen_skills.add(skill_name_lower)
        
        # Step 2: Extract via Dictionary (for coverage)
        dict_skills = self.skill_extractor.extract_skills(text, threshold=threshold)
        for dict_skill in dict_skills:
            skill_name_lower = dict_skill["name"].lower()
            if skill_name_lower not in seen_skills:
                # Enhance dictionary skill with additional metadata
                skill_data = {
                    "name": dict_skill["name"],
                    "source": "DICT-FUZZY",
                    "confidence": 0.80,  # Lower than NER
                    "category": dict_skill.get("category", "tech"),
                    "method": dict_skill.get("method", "fuzzy")
                }
                all_skills.append(skill_data)
                seen_skills.add(skill_name_lower)

        # Step 2b: Multilingual aliases (FR/EN/ES) for better recall.
        if self.multilingual_extractor is not None:
            multilingual_skills = self.multilingual_extractor.extract_skills(text)
            for skill_data in multilingual_skills:
                skill_name_lower = skill_data["name"].lower()
                if skill_name_lower not in seen_skills:
                    all_skills.append(skill_data)
                    seen_skills.add(skill_name_lower)

        # Step 3: Embedding-based normalization to canonical skill list.
        normalized = self._normalize_with_embeddings(all_skills)
        if normalized:
            all_skills = normalized
        
        # Sort by confidence descending
        all_skills.sort(key=lambda x: x.get("confidence", 0), reverse=True)
        
        return all_skills

    def _normalize_with_embeddings(self, skills: List[Dict]) -> List[Dict]:
        """Map extracted skill variants to nearest canonical skills via embeddings."""
        if not skills:
            return []

        normalized: List[Dict] = []
        seen = set()

        for skill in skills:
            raw_name = str(skill.get("name", "")).strip()
            if not raw_name:
                continue

            nearest = SemanticSkillMatcher.search_similar(raw_name, self.canonical_skills, top_k=1)
            if nearest:
                best_name, similarity = nearest[0]
                if similarity >= self.semantic_threshold:
                    skill["normalized_name"] = best_name
                    skill["normalization_similarity"] = round(similarity, 4)
                    skill["name"] = best_name

            key = str(skill.get("name", "")).lower()
            if key in seen:
                continue
            seen.add(key)
            normalized.append(skill)

        return normalized
    
    def _extract_via_ner(self, text: str) -> List[Dict]:
        """Extract candidate skill entities from NER pipeline output."""
        if not self.ner_available or not text:
            return []
        
        try:
            # Keep runtime bounded.
            text_truncated = text[:2000]
            
            ner_results = self.ner_pipeline(text_truncated)
            
            ner_skills = []
            for entity in ner_results:
                group = str(entity.get("entity_group", "")).upper()
                if group not in {"MISC", "ORG", "SKILL"}:
                    continue
                if entity.get("score", 0) <= 0.70:
                    continue

                skill_name = str(entity.get("word", "")).strip().replace("##", "")
                skill_name = re.sub(r"\s+", " ", skill_name)
                if len(skill_name) < 2:
                    continue

                ner_skills.append({
                    "name": skill_name.title(),
                    "source": "NER",
                    "confidence": float(entity.get("score", 0.95)),
                    "category": self._classify_skill(skill_name),
                    "method": f"NER-{self.ner_model_name}"
                })
            
            return ner_skills
        except Exception as e:
            print(f"⚠️ NER extraction error: {e}")
            return []
    
    def _classify_skill(self, skill_name: str) -> str:
        """Classify skill into category"""
        skill_lower = skill_name.lower()
        
        # Check in existing skill categories
        category = self.skill_extractor.skill_categories.get(skill_lower, None)
        if category:
            return category
        
        # Smart classification based on keywords
        tech_keywords = ["python", "java", "javascript", "react", "angular", "vue", "node",
                        "aws", "azure", "gcp", "docker", "kubernetes", "sql", "mongodb",
                        "api", "rest", "graphql", "fastapi", "django", "flask"]
        
        soft_keywords = ["leadership", "communication", "teamwork", "management", "planning",
                        "problem solving", "analytical", "creative", "adaptability"]
        
        language_keywords = ["english", "french", "spanish", "german", "italian", "arabic",
                            "portuguese", "mandarin", "japanese"]
        
        if any(keyword in skill_lower for keyword in tech_keywords):
            return "tech"
        elif any(keyword in skill_lower for keyword in soft_keywords):
            return "soft"
        elif any(keyword in skill_lower for keyword in language_keywords):
            return "language"
        else:
            return "tech"  # Default
    
    def get_extraction_stats(self, skills: List[Dict]) -> Dict:
        """Get statistics about extraction"""
        if not skills:
            return {
                "total": 0,
                "by_source": {},
                "avg_confidence": 0,
                "coverage": 0
            }
        
        by_source = {}
        for skill in skills:
            source = skill.get("source", "unknown")
            by_source[source] = by_source.get(source, 0) + 1
        
        avg_confidence = sum(s.get("confidence", 0) for s in skills) / len(skills)
        
        return {
            "total": len(skills),
            "by_source": by_source,
            "avg_confidence": round(avg_confidence, 3),
            "top_3": [s["name"] for s in skills[:3]]
        }


# Usage example
if __name__ == "__main__":
    extractor = EnhancedSkillExtractor()
    
    sample_text = """
    Python developer with 5 years experience.
    Expert in FastAPI, Django, React, Docker.
    Strong communication and leadership skills.
    Fluent in English and French.
    """
    
    skills = extractor.extract_skills_hybrid(sample_text)
    stats = extractor.get_extraction_stats(skills)
    
    print(f"Found {stats['total']} skills:")
    for skill in skills:
        print(f"  - {skill['name']:20} ({skill['source']:10}) conf: {skill['confidence']:.2f}")
    
    print(f"\nStats: {json.dumps(stats, indent=2)}")