File size: 7,872 Bytes
9df97a2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 | """
Test NER Integration - Validates complete Étape 5-6 pipeline
"""
import sys
import os
from pathlib import Path
# Put the directory containing this file at the front of sys.path.
# NOTE(review): presumably this is the backend root, so that the `app` and
# `ai_module` imports below resolve when the script is run directly - confirm.
sys.path.insert(0, str(Path(__file__).parent))
# Set a dummy DATABASE_URL before importing app modules.
# This assignment is unconditional: any DATABASE_URL already present in the
# environment is overwritten with the local SQLite URL.
# NOTE(review): presumably the app modules read DATABASE_URL at import time,
# which is why this must stay above the imports below - confirm.
os.environ['DATABASE_URL'] = 'sqlite:///./test.db'
from app.models.models import Candidate
from app.services.cv_extractor import CVExtractionService
from ai_module.nlp.enhanced_skill_extractor import EnhancedSkillExtractor
def test_ner_integration():
    """Run the NER integration checks end to end and report the outcome.

    The checks are executed in order and progress is printed to stdout:

    1.  Instantiate ``CVExtractionService``.
    2.  Extract structured data from a conventional sample CV.
    2b. Extract from an atypically laid-out CV and require an email, at
        least one experience, and a GitHub or portfolio link.
    3.  Convert the first extraction result with ``to_candidate_dict``.
    4.  Require every expected NER key in the candidate dict.
    4b. Require the extended keys in the structured payload.
    5.  Run ``EnhancedSkillExtractor.extract_skills_hybrid`` (best effort:
        a failure here is reported but does not fail the run).
    6.  Require the NER columns on the ``Candidate`` model table.

    Returns:
        bool: ``True`` when every mandatory check passes, ``False`` as soon
        as one of them fails or raises.
    """
    separator = "=" * 70

    # Sample CV text
    sample_cv = """
JOHN SMITH
Email: john.smith@example.com
Phone: +33 6 12 34 56 78
LinkedIn: linkedin.com/in/johnsmith
PROFESSIONAL SUMMARY
Senior Full Stack Developer with 8 years of experience in web development.
EXPERIENCE
Senior Developer - Tech Company Inc (2020-2024)
- Led team of 5 developers
- Built microservices using Python and FastAPI
- Managed PostgreSQL databases
Junior Developer - Startup LLC (2016-2020)
- Developed React frontend applications
- Worked with Node.js backend
EDUCATION
Bachelor of Science in Computer Science
University of Technology (2016)
SKILLS
Languages: Python, JavaScript, TypeScript, SQL, HTML/CSS
Frameworks: FastAPI, React, Django, Node.js
Databases: PostgreSQL, MongoDB, Redis
Tools: Docker, Kubernetes, Git, AWS
Soft Skills: Leadership, Communication, Project Management
"""

    print(separator)
    print("NER INTEGRATION TEST - Étape 5-6")
    print(separator)

    # Test 1: Create extraction service
    print("\n[TEST 1] Creating CVExtractionService...")
    try:
        service = CVExtractionService()
        print("✅ Service created successfully")
    except Exception as e:
        print(f"❌ Failed to create service: {e}")
        return False

    # Test 2: Extract from text
    print("\n[TEST 2] Extracting structured data from CV text...")
    try:
        result = service.extract_from_text(sample_cv)
        print("✅ Extraction completed")
        print(f" - Quality Score: {result.quality_score:.1f}%")
        print(f" - Entities Found: {result.extraction_metadata.get('entities_found', 0)}")
        print(f" - Skills Extracted: {len(result.skills)}")
        structured = result.structured
        print(f" - Experiences: {len(structured.get('experiences', []))}")
        print(f" - Projects: {len(structured.get('projects', []))}")
    except Exception as e:
        print(f"❌ Extraction failed: {e}")
        return False

    # Test 2b: Atypical CV format should still produce meaningful extraction
    print("\n[TEST 2b] Testing atypical CV layout robustness...")
    atypical_cv = """
Jane Doe | Data Engineer | Paris
contact: jane.doe@mail.com | +33 7 11 22 33 44
github.com/janedoe | janedoe.dev
2022 - Present | Data Engineer | Blue Analytics
Built ETL pipelines on Airflow and Spark
Implemented data quality checks and dashboards
2020-2022 - BI Analyst - Retail Group
Automated SQL reporting and Power BI models
Certifications
AWS Certified Cloud Practitioner
Scrum Master PSM I
Projects
Customer churn prediction using Python and scikit-learn
"""
    # Guarded like every other step so an extraction error is reported as a
    # failed check instead of aborting the whole script with a traceback.
    try:
        atypical_result = service.extract_from_text(atypical_cv)
        atypical_structured = atypical_result.structured
    except Exception as e:
        print(f"❌ Atypical layout extraction failed: {e}")
        return False
    if not atypical_structured.get('email'):
        print("❌ Atypical layout: email was not extracted")
        return False
    if not atypical_structured.get('experiences'):
        print("❌ Atypical layout: experiences were not extracted")
        return False
    if not atypical_structured.get('github_urls') and not atypical_structured.get('portfolio_urls'):
        print("❌ Atypical layout: web links were not extracted")
        return False
    print("✅ Atypical layout extraction is robust")

    # Test 3: Convert to candidate dict
    print("\n[TEST 3] Converting extraction result to candidate dict...")
    try:
        candidate_dict = service.to_candidate_dict(result)
        print("✅ Conversion successful")
        print(f" - Full Name: {candidate_dict.get('full_name', 'N/A')}")
        print(f" - Email: {candidate_dict.get('email', 'N/A')}")
        print(f" - Extracted Name: {candidate_dict.get('extracted_name', 'N/A')}")
        print(f" - Quality Score: {candidate_dict.get('extraction_quality_score', 0):.1f}%")
    except Exception as e:
        print(f"❌ Conversion failed: {e}")
        return False

    # Test 4: Verify required NER fields
    print("\n[TEST 4] Validating NER fields in candidate dict...")
    ner_fields = [
        'extracted_name',
        'extracted_emails',
        'extracted_phones',
        'extracted_job_titles',
        'extracted_companies',
        'extracted_education',
        'extraction_quality_score',
        'is_fully_extracted',
    ]
    missing_fields = [field for field in ner_fields if field not in candidate_dict]
    if missing_fields:
        print(f"❌ Missing fields: {missing_fields}")
        return False
    print("✅ All NER fields present")

    # Test 4b: Verify rich structured payload contains generalized fields
    print("\n[TEST 4b] Validating extended structured fields...")
    if not isinstance(structured.get('experiences', []), list):
        print("❌ experiences should be a list")
        return False
    if 'projects' not in structured or 'certifications' not in structured:
        print("❌ Missing projects/certifications in structured payload")
        return False
    if 'github_urls' not in structured or 'portfolio_urls' not in structured:
        print("❌ Missing github_urls/portfolio_urls in structured payload")
        return False
    print("✅ Extended structured fields are present")

    # Test 5: Verify EnhancedSkillExtractor
    print("\n[TEST 5] Testing EnhancedSkillExtractor hybrid extraction...")
    try:
        skill_extractor = EnhancedSkillExtractor(load_ner=False)
        skills = skill_extractor.extract_skills_hybrid(sample_cv)
        print("✅ Hybrid skill extraction working")
        print(f" - Total skills extracted: {len(skills)}")
        if skills:
            print(" - Top 3 skills:")
            for skill in skills[:3]:
                print(f" • {skill['name']} ({skill['category']}) - Score: {skill['confidence']:.0%}")
    except Exception as e:
        print(f"❌ Skill extraction failed: {e}")
        # Don't fail on this as NER might not be available
        print(" (Note: NER may not be available, but fallback should work)")

    # Test 6: Verify model schema
    print("\n[TEST 6] Validating Candidate model schema...")
    try:
        # Check that Candidate class has NER columns
        candidate_columns = {col.name for col in Candidate.__table__.columns}
        ner_columns = {
            'extracted_name', 'extracted_emails', 'extracted_phones',
            'extracted_job_titles', 'extracted_companies', 'extracted_education',
            'extraction_quality_score', 'ner_extraction_data', 'is_fully_extracted',
        }
        missing = ner_columns - candidate_columns
        if missing:
            print(f"❌ Missing columns in Candidate model: {missing}")
            return False
        print("✅ All NER columns present in Candidate model")
    except Exception as e:
        print(f"❌ Schema validation failed: {e}")
        return False

    print("\n" + separator)
    print("✅ ALL TESTS PASSED - NER Integration Successful!")
    print(separator)
    return True
if __name__ == "__main__":
    # Script entry point: map the boolean outcome of the integration run to a
    # process exit status (0 when it passed, 1 otherwise).
    exit_status = 0 if test_ner_integration() else 1
    sys.exit(exit_status)
|