| """ |
| Test NER Integration - Validates complete Γtape 5-6 pipeline |
| """ |
|
|
| import sys |
| import os |
| from pathlib import Path |
|
|
| |
| sys.path.insert(0, str(Path(__file__).parent)) |
|
|
| |
| os.environ['DATABASE_URL'] = 'sqlite:///./test.db' |
|
|
| from app.models.models import Candidate |
| from app.services.cv_extractor import CVExtractionService |
| from ai_module.nlp.enhanced_skill_extractor import EnhancedSkillExtractor |
|
|
|
|
| def test_ner_integration(): |
| """Test complete NER integration from text to candidate dict""" |
| |
| |
| sample_cv = """ |
| JOHN SMITH |
| Email: john.smith@example.com |
| Phone: +33 6 12 34 56 78 |
| LinkedIn: linkedin.com/in/johnsmith |
| |
| PROFESSIONAL SUMMARY |
| Senior Full Stack Developer with 8 years of experience in web development. |
| |
| EXPERIENCE |
| Senior Developer - Tech Company Inc (2020-2024) |
| - Led team of 5 developers |
| - Built microservices using Python and FastAPI |
| - Managed PostgreSQL databases |
| |
| Junior Developer - Startup LLC (2016-2020) |
| - Developed React frontend applications |
| - Worked with Node.js backend |
| |
| EDUCATION |
| Bachelor of Science in Computer Science |
| University of Technology (2016) |
| |
| SKILLS |
| Languages: Python, JavaScript, TypeScript, SQL, HTML/CSS |
| Frameworks: FastAPI, React, Django, Node.js |
| Databases: PostgreSQL, MongoDB, Redis |
| Tools: Docker, Kubernetes, Git, AWS |
| Soft Skills: Leadership, Communication, Project Management |
| """ |
| |
| print("=" * 70) |
| print("NER INTEGRATION TEST - Γtape 5-6") |
| print("=" * 70) |
| |
| |
| print("\n[TEST 1] Creating CVExtractionService...") |
| try: |
| service = CVExtractionService() |
| print("β
Service created successfully") |
| except Exception as e: |
| print(f"β Failed to create service: {e}") |
| return False |
| |
| |
| print("\n[TEST 2] Extracting structured data from CV text...") |
| try: |
| result = service.extract_from_text(sample_cv) |
| print(f"β
Extraction completed") |
| print(f" - Quality Score: {result.quality_score:.1f}%") |
| print(f" - Entities Found: {result.extraction_metadata.get('entities_found', 0)}") |
| print(f" - Skills Extracted: {len(result.skills)}") |
| structured = result.structured |
| print(f" - Experiences: {len(structured.get('experiences', []))}") |
| print(f" - Projects: {len(structured.get('projects', []))}") |
| except Exception as e: |
| print(f"β Extraction failed: {e}") |
| return False |
|
|
| |
| print("\n[TEST 2b] Testing atypical CV layout robustness...") |
| atypical_cv = """ |
| Jane Doe | Data Engineer | Paris |
| contact: jane.doe@mail.com | +33 7 11 22 33 44 |
| github.com/janedoe | janedoe.dev |
| |
| 2022 - Present | Data Engineer | Blue Analytics |
| Built ETL pipelines on Airflow and Spark |
| Implemented data quality checks and dashboards |
| |
| 2020-2022 - BI Analyst - Retail Group |
| Automated SQL reporting and Power BI models |
| |
| Certifications |
| AWS Certified Cloud Practitioner |
| Scrum Master PSM I |
| |
| Projects |
| Customer churn prediction using Python and scikit-learn |
| """ |
| atypical_result = service.extract_from_text(atypical_cv) |
| atypical_structured = atypical_result.structured |
| if not atypical_structured.get('email'): |
| print("β Atypical layout: email was not extracted") |
| return False |
| if not atypical_structured.get('experiences'): |
| print("β Atypical layout: experiences were not extracted") |
| return False |
| if not atypical_structured.get('github_urls') and not atypical_structured.get('portfolio_urls'): |
| print("β Atypical layout: web links were not extracted") |
| return False |
| print("β
Atypical layout extraction is robust") |
| |
| |
| print("\n[TEST 3] Converting extraction result to candidate dict...") |
| try: |
| candidate_dict = service.to_candidate_dict(result) |
| print(f"β
Conversion successful") |
| print(f" - Full Name: {candidate_dict.get('full_name', 'N/A')}") |
| print(f" - Email: {candidate_dict.get('email', 'N/A')}") |
| print(f" - Extracted Name: {candidate_dict.get('extracted_name', 'N/A')}") |
| print(f" - Quality Score: {candidate_dict.get('extraction_quality_score', 0):.1f}%") |
| except Exception as e: |
| print(f"β Conversion failed: {e}") |
| return False |
| |
| |
| print("\n[TEST 4] Validating NER fields in candidate dict...") |
| ner_fields = [ |
| 'extracted_name', |
| 'extracted_emails', |
| 'extracted_phones', |
| 'extracted_job_titles', |
| 'extracted_companies', |
| 'extracted_education', |
| 'extraction_quality_score', |
| 'is_fully_extracted' |
| ] |
| |
| missing_fields = [] |
| for field in ner_fields: |
| if field not in candidate_dict: |
| missing_fields.append(field) |
| |
| if missing_fields: |
| print(f"β Missing fields: {missing_fields}") |
| return False |
| else: |
| print(f"β
All NER fields present") |
|
|
| |
| print("\n[TEST 4b] Validating extended structured fields...") |
| if not isinstance(structured.get('experiences', []), list): |
| print("β experiences should be a list") |
| return False |
| if 'projects' not in structured or 'certifications' not in structured: |
| print("β Missing projects/certifications in structured payload") |
| return False |
| if 'github_urls' not in structured or 'portfolio_urls' not in structured: |
| print("β Missing github_urls/portfolio_urls in structured payload") |
| return False |
| print("β
Extended structured fields are present") |
| |
| |
| print("\n[TEST 5] Testing EnhancedSkillExtractor hybrid extraction...") |
| try: |
| skill_extractor = EnhancedSkillExtractor(load_ner=False) |
| skills = skill_extractor.extract_skills_hybrid(sample_cv) |
| print(f"β
Hybrid skill extraction working") |
| print(f" - Total skills extracted: {len(skills)}") |
| |
| if skills: |
| print(f" - Top 3 skills:") |
| for skill in skills[:3]: |
| print(f" β’ {skill['name']} ({skill['category']}) - Score: {skill['confidence']:.0%}") |
| except Exception as e: |
| print(f"β Skill extraction failed: {e}") |
| |
| print(f" (Note: NER may not be available, but fallback should work)") |
| |
| |
| print("\n[TEST 6] Validating Candidate model schema...") |
| try: |
| |
| candidate_columns = {col.name for col in Candidate.__table__.columns} |
| ner_columns = { |
| 'extracted_name', 'extracted_emails', 'extracted_phones', |
| 'extracted_job_titles', 'extracted_companies', 'extracted_education', |
| 'extraction_quality_score', 'ner_extraction_data', 'is_fully_extracted' |
| } |
| |
| missing = ner_columns - candidate_columns |
| if missing: |
| print(f"β Missing columns in Candidate model: {missing}") |
| return False |
| else: |
| print(f"β
All NER columns present in Candidate model") |
| except Exception as e: |
| print(f"β Schema validation failed: {e}") |
| return False |
| |
| print("\n" + "=" * 70) |
| print("β
ALL TESTS PASSED - NER Integration Successful!") |
| print("=" * 70) |
| return True |
|
|
|
|
| if __name__ == "__main__": |
| success = test_ner_integration() |
| sys.exit(0 if success else 1) |
|
|