File size: 7,872 Bytes
9df97a2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
"""
Test NER Integration - Validates the complete Étape 5-6 pipeline
"""

import sys
import os
from pathlib import Path

# Put this file's own directory at the front of sys.path (the original note
# calls it the backend directory), presumably so that the `app` and
# `ai_module` imports below resolve when this file is run as a script.
sys.path.insert(0, str(Path(__file__).parent))

# Set a dummy DATABASE_URL before importing app modules. This assignment
# unconditionally overwrites any DATABASE_URL already set in the environment.
# NOTE(review): presumably the app modules read this variable at import time,
# which is why it must be set before the imports below - confirm.
os.environ['DATABASE_URL'] = 'sqlite:///./test.db'

from app.models.models import Candidate
from app.services.cv_extractor import CVExtractionService
from ai_module.nlp.enhanced_skill_extractor import EnhancedSkillExtractor


def test_ner_integration():
    """Run the NER integration checks end to end and report progress on stdout.

    The steps are, in order:

    1.  Instantiate ``CVExtractionService``.
    2.  Extract structured data from a conventional sample CV.
    2b. Extract from an atypically laid-out CV and require an email, at least
        one experience, and at least one GitHub/portfolio link.
    3.  Convert the first extraction result to a candidate dict.
    4.  Require the NER keys in the candidate dict.
    4b. Require the extended keys in the structured payload.
    5.  Exercise ``EnhancedSkillExtractor`` (best effort: a failure here is
        reported but does not fail the run).
    6.  Require the NER columns on the ``Candidate`` model table.

    Returns:
        bool: ``True`` when every mandatory step succeeded, ``False`` as soon
        as one of them fails. The ``__main__`` entry point maps this onto the
        process exit status.

    NOTE(review): pytest does not interpret a test function's return value as
    pass/fail, and recent versions complain about non-``None`` returns, so this
    function is best run as a script unless it is converted to assertions.
    """

    # Sample CV text
    sample_cv = """
    JOHN SMITH
    Email: john.smith@example.com
    Phone: +33 6 12 34 56 78
    LinkedIn: linkedin.com/in/johnsmith
    
    PROFESSIONAL SUMMARY
    Senior Full Stack Developer with 8 years of experience in web development.
    
    EXPERIENCE
    Senior Developer - Tech Company Inc (2020-2024)
    - Led team of 5 developers
    - Built microservices using Python and FastAPI
    - Managed PostgreSQL databases
    
    Junior Developer - Startup LLC (2016-2020)
    - Developed React frontend applications
    - Worked with Node.js backend
    
    EDUCATION
    Bachelor of Science in Computer Science
    University of Technology (2016)
    
    SKILLS
    Languages: Python, JavaScript, TypeScript, SQL, HTML/CSS
    Frameworks: FastAPI, React, Django, Node.js
    Databases: PostgreSQL, MongoDB, Redis
    Tools: Docker, Kubernetes, Git, AWS
    Soft Skills: Leadership, Communication, Project Management
    """

    print("=" * 70)
    print("NER INTEGRATION TEST - Étape 5-6")
    print("=" * 70)

    # Test 1: Create extraction service
    print("\n[TEST 1] Creating CVExtractionService...")
    try:
        service = CVExtractionService()
        print("✅ Service created successfully")
    except Exception as e:
        print(f"❌ Failed to create service: {e}")
        return False

    # Test 2: Extract from text
    print("\n[TEST 2] Extracting structured data from CV text...")
    try:
        result = service.extract_from_text(sample_cv)
        print("✅ Extraction completed")
        print(f"   - Quality Score: {result.quality_score:.1f}%")
        print(f"   - Entities Found: {result.extraction_metadata.get('entities_found', 0)}")
        print(f"   - Skills Extracted: {len(result.skills)}")
        # Kept for Test 4b, which validates the structured payload's keys.
        structured = result.structured
        print(f"   - Experiences: {len(structured.get('experiences', []))}")
        print(f"   - Projects: {len(structured.get('projects', []))}")
    except Exception as e:
        print(f"❌ Extraction failed: {e}")
        return False

    # Test 2b: Atypical CV format should still produce meaningful extraction
    print("\n[TEST 2b] Testing atypical CV layout robustness...")
    atypical_cv = """
    Jane Doe | Data Engineer | Paris
    contact: jane.doe@mail.com | +33 7 11 22 33 44
    github.com/janedoe | janedoe.dev

    2022 - Present | Data Engineer | Blue Analytics
    Built ETL pipelines on Airflow and Spark
    Implemented data quality checks and dashboards

    2020-2022 - BI Analyst - Retail Group
    Automated SQL reporting and Power BI models

    Certifications
    AWS Certified Cloud Practitioner
    Scrum Master PSM I

    Projects
    Customer churn prediction using Python and scikit-learn
    """
    # Guarded like every other step so an extractor exception is reported as
    # a failed check instead of aborting the whole run with a traceback.
    try:
        atypical_result = service.extract_from_text(atypical_cv)
        atypical_structured = atypical_result.structured
    except Exception as e:
        print(f"❌ Atypical layout: extraction failed: {e}")
        return False
    if not atypical_structured.get('email'):
        print("❌ Atypical layout: email was not extracted")
        return False
    if not atypical_structured.get('experiences'):
        print("❌ Atypical layout: experiences were not extracted")
        return False
    # Either kind of web link is enough to pass this check.
    if not atypical_structured.get('github_urls') and not atypical_structured.get('portfolio_urls'):
        print("❌ Atypical layout: web links were not extracted")
        return False
    print("✅ Atypical layout extraction is robust")

    # Test 3: Convert to candidate dict
    print("\n[TEST 3] Converting extraction result to candidate dict...")
    try:
        candidate_dict = service.to_candidate_dict(result)
        print("✅ Conversion successful")
        print(f"   - Full Name: {candidate_dict.get('full_name', 'N/A')}")
        print(f"   - Email: {candidate_dict.get('email', 'N/A')}")
        print(f"   - Extracted Name: {candidate_dict.get('extracted_name', 'N/A')}")
        print(f"   - Quality Score: {candidate_dict.get('extraction_quality_score', 0):.1f}%")
    except Exception as e:
        print(f"❌ Conversion failed: {e}")
        return False

    # Test 4: Verify required NER fields
    print("\n[TEST 4] Validating NER fields in candidate dict...")
    ner_fields = [
        'extracted_name',
        'extracted_emails',
        'extracted_phones',
        'extracted_job_titles',
        'extracted_companies',
        'extracted_education',
        'extraction_quality_score',
        'is_fully_extracted',
    ]

    missing_fields = [field for field in ner_fields if field not in candidate_dict]
    if missing_fields:
        print(f"❌ Missing fields: {missing_fields}")
        return False
    print("✅ All NER fields present")

    # Test 4b: Verify rich structured payload contains generalized fields
    print("\n[TEST 4b] Validating extended structured fields...")
    if not isinstance(structured.get('experiences', []), list):
        print("❌ experiences should be a list")
        return False
    if 'projects' not in structured or 'certifications' not in structured:
        print("❌ Missing projects/certifications in structured payload")
        return False
    if 'github_urls' not in structured or 'portfolio_urls' not in structured:
        print("❌ Missing github_urls/portfolio_urls in structured payload")
        return False
    print("✅ Extended structured fields are present")

    # Test 5: Verify EnhancedSkillExtractor
    print("\n[TEST 5] Testing EnhancedSkillExtractor hybrid extraction...")
    try:
        skill_extractor = EnhancedSkillExtractor(load_ner=False)
        skills = skill_extractor.extract_skills_hybrid(sample_cv)
        print("✅ Hybrid skill extraction working")
        print(f"   - Total skills extracted: {len(skills)}")

        if skills:
            print("   - Top 3 skills:")
            for skill in skills[:3]:
                print(f"     • {skill['name']} ({skill['category']}) - Score: {skill['confidence']:.0%}")
    except Exception as e:
        print(f"❌ Skill extraction failed: {e}")
        # Don't fail on this as NER might not be available
        print("   (Note: NER may not be available, but fallback should work)")

    # Test 6: Verify model schema
    print("\n[TEST 6] Validating Candidate model schema...")
    try:
        # Check that Candidate class has NER columns
        candidate_columns = {col.name for col in Candidate.__table__.columns}
        ner_columns = {
            'extracted_name', 'extracted_emails', 'extracted_phones',
            'extracted_job_titles', 'extracted_companies', 'extracted_education',
            'extraction_quality_score', 'ner_extraction_data', 'is_fully_extracted',
        }

        missing = ner_columns - candidate_columns
        if missing:
            print(f"❌ Missing columns in Candidate model: {missing}")
            return False
        print("✅ All NER columns present in Candidate model")
    except Exception as e:
        print(f"❌ Schema validation failed: {e}")
        return False

    print("\n" + "=" * 70)
    print("✅ ALL TESTS PASSED - NER Integration Successful!")
    print("=" * 70)
    return True


if __name__ == "__main__":
    # Script entry point: translate the boolean outcome of the integration
    # run into a process exit status (0 on success, 1 on failure).
    exit_status = 0 if test_ner_integration() else 1
    sys.exit(exit_status)