ai-talent-finder-backend / test_ner_integration.py
ilyass yani
Deploiement backend dans HF Spaces
9df97a2
Raw
History Blame
7.87 kB
"""
Test NER Integration - Validates complete Étape 5-6 pipeline
"""
import sys
import os
from pathlib import Path
# Put this file's own directory first on the import path so the project
# packages imported below can be resolved when the script is run directly.
sys.path.insert(0, os.fspath(Path(__file__).parent))
# The app modules are imported right after this point, so a placeholder
# DATABASE_URL has to be in the environment beforehand.
os.environ.update({'DATABASE_URL': 'sqlite:///./test.db'})
from app.models.models import Candidate
from app.services.cv_extractor import CVExtractionService
from ai_module.nlp.enhanced_skill_extractor import EnhancedSkillExtractor
def test_ner_integration():
    """Run the NER integration checks end to end and report the result.

    The steps run in order, each printing a status line:

    1.  Instantiate ``CVExtractionService``.
    2.  Extract structured data from a conventional sample CV.
    2b. Extract from an atypically laid-out CV and require an email,
        experiences and at least one GitHub or portfolio link.
    3.  Convert the first extraction result to a candidate dict.
    4.  Check that every expected NER key is present in that dict.
    4b. Check the extended fields of the structured payload.
    5.  Exercise ``EnhancedSkillExtractor`` (non-fatal: a failure is only
        reported, it never changes the return value).
    6.  Check that the ``Candidate`` table exposes the NER columns.

    Returns:
        bool: ``True`` when every mandatory step passed, ``False`` as soon
        as one of them fails.

    NOTE(review): the verdict is communicated through the return value, which
    the ``__main__`` entry point turns into an exit code. A test runner that
    ignores return values would presumably not see a ``False`` as a failure -
    confirm how this file is meant to be executed.
    """
    # The CV fixtures are kept flush-left on purpose so the string literals
    # carry no leading indentation into the extractor.
    sample_cv = """
JOHN SMITH
Email: john.smith@example.com
Phone: +33 6 12 34 56 78
LinkedIn: linkedin.com/in/johnsmith
PROFESSIONAL SUMMARY
Senior Full Stack Developer with 8 years of experience in web development.
EXPERIENCE
Senior Developer - Tech Company Inc (2020-2024)
- Led team of 5 developers
- Built microservices using Python and FastAPI
- Managed PostgreSQL databases
Junior Developer - Startup LLC (2016-2020)
- Developed React frontend applications
- Worked with Node.js backend
EDUCATION
Bachelor of Science in Computer Science
University of Technology (2016)
SKILLS
Languages: Python, JavaScript, TypeScript, SQL, HTML/CSS
Frameworks: FastAPI, React, Django, Node.js
Databases: PostgreSQL, MongoDB, Redis
Tools: Docker, Kubernetes, Git, AWS
Soft Skills: Leadership, Communication, Project Management
"""
    print("=" * 70)
    print("NER INTEGRATION TEST - Étape 5-6")
    print("=" * 70)

    # Test 1: Create extraction service
    print("\n[TEST 1] Creating CVExtractionService...")
    try:
        service = CVExtractionService()
        print("✅ Service created successfully")
    except Exception as e:
        print(f"❌ Failed to create service: {e}")
        return False

    # Test 2: Extract from text
    print("\n[TEST 2] Extracting structured data from CV text...")
    try:
        result = service.extract_from_text(sample_cv)
        print("✅ Extraction completed")
        print(f" - Quality Score: {result.quality_score:.1f}%")
        print(f" - Entities Found: {result.extraction_metadata.get('entities_found', 0)}")
        print(f" - Skills Extracted: {len(result.skills)}")
        # Kept for the structural checks of step 4b further down.
        structured = result.structured
        print(f" - Experiences: {len(structured.get('experiences', []))}")
        print(f" - Projects: {len(structured.get('projects', []))}")
    except Exception as e:
        print(f"❌ Extraction failed: {e}")
        return False

    # Test 2b: Atypical CV format should still produce meaningful extraction
    print("\n[TEST 2b] Testing atypical CV layout robustness...")
    atypical_cv = """
Jane Doe | Data Engineer | Paris
contact: jane.doe@mail.com | +33 7 11 22 33 44
github.com/janedoe | janedoe.dev
2022 - Present | Data Engineer | Blue Analytics
Built ETL pipelines on Airflow and Spark
Implemented data quality checks and dashboards
2020-2022 - BI Analyst - Retail Group
Automated SQL reporting and Power BI models
Certifications
AWS Certified Cloud Practitioner
Scrum Master PSM I
Projects
Customer churn prediction using Python and scikit-learn
"""
    # Guarded like every other step: an extractor crash must be reported as a
    # failed step instead of aborting the whole run with a traceback.
    try:
        atypical_result = service.extract_from_text(atypical_cv)
        atypical_structured = atypical_result.structured
    except Exception as e:
        print(f"❌ Atypical layout extraction failed: {e}")
        return False
    if not atypical_structured.get('email'):
        print("❌ Atypical layout: email was not extracted")
        return False
    if not atypical_structured.get('experiences'):
        print("❌ Atypical layout: experiences were not extracted")
        return False
    # Either kind of web link is enough to pass this check.
    if not atypical_structured.get('github_urls') and not atypical_structured.get('portfolio_urls'):
        print("❌ Atypical layout: web links were not extracted")
        return False
    print("✅ Atypical layout extraction is robust")

    # Test 3: Convert to candidate dict
    print("\n[TEST 3] Converting extraction result to candidate dict...")
    try:
        candidate_dict = service.to_candidate_dict(result)
        print("✅ Conversion successful")
        print(f" - Full Name: {candidate_dict.get('full_name', 'N/A')}")
        print(f" - Email: {candidate_dict.get('email', 'N/A')}")
        print(f" - Extracted Name: {candidate_dict.get('extracted_name', 'N/A')}")
        print(f" - Quality Score: {candidate_dict.get('extraction_quality_score', 0):.1f}%")
    except Exception as e:
        print(f"❌ Conversion failed: {e}")
        return False

    # Test 4: Verify required NER fields
    print("\n[TEST 4] Validating NER fields in candidate dict...")
    ner_fields = [
        'extracted_name',
        'extracted_emails',
        'extracted_phones',
        'extracted_job_titles',
        'extracted_companies',
        'extracted_education',
        'extraction_quality_score',
        'is_fully_extracted',
    ]
    # Only key presence is checked here, not the values.
    missing_fields = [field for field in ner_fields if field not in candidate_dict]
    if missing_fields:
        print(f"❌ Missing fields: {missing_fields}")
        return False
    print("✅ All NER fields present")

    # Test 4b: Verify rich structured payload contains generalized fields
    print("\n[TEST 4b] Validating extended structured fields...")
    if not isinstance(structured.get('experiences', []), list):
        print("❌ experiences should be a list")
        return False
    if 'projects' not in structured or 'certifications' not in structured:
        print("❌ Missing projects/certifications in structured payload")
        return False
    if 'github_urls' not in structured or 'portfolio_urls' not in structured:
        print("❌ Missing github_urls/portfolio_urls in structured payload")
        return False
    print("✅ Extended structured fields are present")

    # Test 5: Verify EnhancedSkillExtractor
    print("\n[TEST 5] Testing EnhancedSkillExtractor hybrid extraction...")
    try:
        skill_extractor = EnhancedSkillExtractor(load_ner=False)
        skills = skill_extractor.extract_skills_hybrid(sample_cv)
        print("✅ Hybrid skill extraction working")
        print(f" - Total skills extracted: {len(skills)}")
        if skills:
            print(" - Top 3 skills:")
            for skill in skills[:3]:
                print(f" • {skill['name']} ({skill['category']}) - Score: {skill['confidence']:.0%}")
    except Exception as e:
        # Deliberately best-effort: this step is reported but never fails the
        # run, since NER might not be available.
        print(f"❌ Skill extraction failed: {e}")
        print(" (Note: NER may not be available, but fallback should work)")

    # Test 6: Verify model schema
    print("\n[TEST 6] Validating Candidate model schema...")
    try:
        # Check that Candidate class has NER columns
        candidate_columns = {col.name for col in Candidate.__table__.columns}
        ner_columns = {
            'extracted_name', 'extracted_emails', 'extracted_phones',
            'extracted_job_titles', 'extracted_companies', 'extracted_education',
            'extraction_quality_score', 'ner_extraction_data', 'is_fully_extracted',
        }
        missing = ner_columns - candidate_columns
        if missing:
            print(f"❌ Missing columns in Candidate model: {missing}")
            return False
        print("✅ All NER columns present in Candidate model")
    except Exception as e:
        print(f"❌ Schema validation failed: {e}")
        return False

    print("\n" + "=" * 70)
    print("✅ ALL TESTS PASSED - NER Integration Successful!")
    print("=" * 70)
    return True
if __name__ == "__main__":
    # Map the boolean verdict of the integration run onto a process exit
    # status: 0 when everything passed, 1 otherwise.
    exit_status = 0 if test_ner_integration() else 1
    sys.exit(exit_status)