Spaces:

RHmaster
/

ai-talent-finder-backend

Running

File size: 7,872 Bytes

9df97a2

"""
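Integration test for the NER pipeline (Étape 5-6, i.e. steps 5-6): CV text
extraction, conversion to a candidate dict, hybrid skill extraction, and the
NER columns of the Candidate model.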
"""

import sys
import os
from pathlib import Path

# Put this file's directory first on sys.path so the `app` and `ai_module`
# imports below resolve when the script is run directly (presumably this file
# lives in the backend root -- confirm against the repository layout).
sys.path.insert(0, str(Path(__file__).parent))

# Set a dummy DATABASE_URL before importing app modules.
# NOTE(review): this unconditionally overwrites any DATABASE_URL already set in
# the environment; presumably the app modules read it at import time, which is
# why it must precede the project imports -- confirm.
os.environ['DATABASE_URL'] = 'sqlite:///./test.db'

from app.models.models import Candidate
from app.services.cv_extractor import CVExtractionService
from ai_module.nlp.enhanced_skill_extractor import EnhancedSkillExtractor


def test_ner_integration():
    """Run the end-to-end NER integration checks and print a report.

    The checks run in order and stop at the first mandatory failure:

    1.  ``CVExtractionService`` can be instantiated.
    2.  ``extract_from_text`` works on a conventional CV.
    2b. ``extract_from_text`` still yields an email, experiences and at least
        one web link on an atypically formatted CV.
    3.  ``to_candidate_dict`` converts the extraction result.
    4.  The candidate dict contains every expected NER field.
    4b. The structured payload exposes the extended fields.
    5.  ``EnhancedSkillExtractor.extract_skills_hybrid`` runs (non-fatal:
        a failure is reported but does not fail the run).
    6.  The ``Candidate`` model declares every NER column.

    Returns:
        bool: ``True`` when every mandatory check passed, ``False`` as soon as
        one of them fails. Exceptions raised by the services under test are
        caught, reported on stdout and turned into ``False``.

    NOTE(review): failure is signalled through the return value (used by the
    ``__main__`` entry point), not through assertions -- pytest does not
    interpret a ``False`` return as a failed test.
    """
    separator = "=" * 70

    # Sample CV text
    sample_cv = """
    JOHN SMITH
    Email: john.smith@example.com
    Phone: +33 6 12 34 56 78
    LinkedIn: linkedin.com/in/johnsmith
    
    PROFESSIONAL SUMMARY
    Senior Full Stack Developer with 8 years of experience in web development.
    
    EXPERIENCE
    Senior Developer - Tech Company Inc (2020-2024)
    - Led team of 5 developers
    - Built microservices using Python and FastAPI
    - Managed PostgreSQL databases
    
    Junior Developer - Startup LLC (2016-2020)
    - Developed React frontend applications
    - Worked with Node.js backend
    
    EDUCATION
    Bachelor of Science in Computer Science
    University of Technology (2016)
    
    SKILLS
    Languages: Python, JavaScript, TypeScript, SQL, HTML/CSS
    Frameworks: FastAPI, React, Django, Node.js
    Databases: PostgreSQL, MongoDB, Redis
    Tools: Docker, Kubernetes, Git, AWS
    Soft Skills: Leadership, Communication, Project Management
    """

    print(separator)
    print("NER INTEGRATION TEST - Étape 5-6")
    print(separator)

    # Test 1: Create extraction service
    print("\n[TEST 1] Creating CVExtractionService...")
    try:
        service = CVExtractionService()
        print("✅ Service created successfully")
    except Exception as e:
        print(f"❌ Failed to create service: {e}")
        return False

    # Test 2: Extract from text
    print("\n[TEST 2] Extracting structured data from CV text...")
    try:
        result = service.extract_from_text(sample_cv)
        print("✅ Extraction completed")
        print(f"   - Quality Score: {result.quality_score:.1f}%")
        print(f"   - Entities Found: {result.extraction_metadata.get('entities_found', 0)}")
        print(f"   - Skills Extracted: {len(result.skills)}")
        structured = result.structured
        print(f"   - Experiences: {len(structured.get('experiences', []))}")
        print(f"   - Projects: {len(structured.get('projects', []))}")
    except Exception as e:
        print(f"❌ Extraction failed: {e}")
        return False

    # Test 2b: Atypical CV format should still produce meaningful extraction
    print("\n[TEST 2b] Testing atypical CV layout robustness...")
    atypical_cv = """
    Jane Doe | Data Engineer | Paris
    contact: jane.doe@mail.com | +33 7 11 22 33 44
    github.com/janedoe | janedoe.dev

    2022 - Present | Data Engineer | Blue Analytics
    Built ETL pipelines on Airflow and Spark
    Implemented data quality checks and dashboards

    2020-2022 - BI Analyst - Retail Group
    Automated SQL reporting and Power BI models

    Certifications
    AWS Certified Cloud Practitioner
    Scrum Master PSM I

    Projects
    Customer churn prediction using Python and scikit-learn
    """
    # Guarded like every other step: an extractor crash on the unusual layout
    # must be reported as a failed check, not escape as a traceback.
    try:
        atypical_result = service.extract_from_text(atypical_cv)
        atypical_structured = atypical_result.structured
    except Exception as e:
        print(f"❌ Atypical layout extraction failed: {e}")
        return False
    if not atypical_structured.get('email'):
        print("❌ Atypical layout: email was not extracted")
        return False
    if not atypical_structured.get('experiences'):
        print("❌ Atypical layout: experiences were not extracted")
        return False
    # Either kind of link is accepted as proof that URLs were recognised.
    if not atypical_structured.get('github_urls') and not atypical_structured.get('portfolio_urls'):
        print("❌ Atypical layout: web links were not extracted")
        return False
    print("✅ Atypical layout extraction is robust")

    # Test 3: Convert to candidate dict
    print("\n[TEST 3] Converting extraction result to candidate dict...")
    try:
        candidate_dict = service.to_candidate_dict(result)
        print("✅ Conversion successful")
        print(f"   - Full Name: {candidate_dict.get('full_name', 'N/A')}")
        print(f"   - Email: {candidate_dict.get('email', 'N/A')}")
        print(f"   - Extracted Name: {candidate_dict.get('extracted_name', 'N/A')}")
        print(f"   - Quality Score: {candidate_dict.get('extraction_quality_score', 0):.1f}%")
    except Exception as e:
        print(f"❌ Conversion failed: {e}")
        return False

    # Test 4: Verify required NER fields
    print("\n[TEST 4] Validating NER fields in candidate dict...")
    ner_fields = [
        'extracted_name',
        'extracted_emails',
        'extracted_phones',
        'extracted_job_titles',
        'extracted_companies',
        'extracted_education',
        'extraction_quality_score',
        'is_fully_extracted',
    ]

    # Only key presence is checked; the values themselves are not validated.
    missing_fields = [field for field in ner_fields if field not in candidate_dict]
    if missing_fields:
        print(f"❌ Missing fields: {missing_fields}")
        return False
    print("✅ All NER fields present")

    # Test 4b: Verify rich structured payload contains generalized fields
    print("\n[TEST 4b] Validating extended structured fields...")
    if not isinstance(structured.get('experiences', []), list):
        print("❌ experiences should be a list")
        return False
    if 'projects' not in structured or 'certifications' not in structured:
        print("❌ Missing projects/certifications in structured payload")
        return False
    if 'github_urls' not in structured or 'portfolio_urls' not in structured:
        print("❌ Missing github_urls/portfolio_urls in structured payload")
        return False
    print("✅ Extended structured fields are present")

    # Test 5: Verify EnhancedSkillExtractor
    print("\n[TEST 5] Testing EnhancedSkillExtractor hybrid extraction...")
    skill_extraction_ok = True
    try:
        skill_extractor = EnhancedSkillExtractor(load_ner=False)
        skills = skill_extractor.extract_skills_hybrid(sample_cv)
        print("✅ Hybrid skill extraction working")
        print(f"   - Total skills extracted: {len(skills)}")

        if skills:
            print("   - Top 3 skills:")
            for skill in skills[:3]:
                print(f"     • {skill['name']} ({skill['category']}) - Score: {skill['confidence']:.0%}")
    except Exception as e:
        # Deliberately non-fatal: remember the failure so the final summary
        # does not claim that every check passed.
        skill_extraction_ok = False
        print(f"❌ Skill extraction failed: {e}")
        # Don't fail on this as NER might not be available
        print("   (Note: NER may not be available, but fallback should work)")

    # Test 6: Verify model schema
    print("\n[TEST 6] Validating Candidate model schema...")
    try:
        # Check that Candidate class has NER columns
        candidate_columns = {col.name for col in Candidate.__table__.columns}
        ner_columns = {
            'extracted_name', 'extracted_emails', 'extracted_phones',
            'extracted_job_titles', 'extracted_companies', 'extracted_education',
            'extraction_quality_score', 'ner_extraction_data', 'is_fully_extracted',
        }

        missing = ner_columns - candidate_columns
        if missing:
            print(f"❌ Missing columns in Candidate model: {missing}")
            return False
        print("✅ All NER columns present in Candidate model")
    except Exception as e:
        print(f"❌ Schema validation failed: {e}")
        return False

    print("\n" + separator)
    if skill_extraction_ok:
        print("✅ ALL TESTS PASSED - NER Integration Successful!")
    else:
        print("⚠️ REQUIRED TESTS PASSED - optional skill extraction check (TEST 5) failed")
    print(separator)
    return True


if __name__ == "__main__":
    # Map the boolean outcome onto a process exit status (0 = success,
    # 1 = failure) so shells and CI jobs can detect a failed run.
    exit_code = 0 if test_ner_integration() else 1
    sys.exit(exit_code)