Spaces:
Sleeping
Sleeping
| """ | |
| Test NER Integration - Validates complete Étape 5-6 pipeline | |
| """ | |
| import sys | |
| import os | |
| from pathlib import Path | |
| # Put this file's directory (the backend root) first on sys.path so the `app` and `ai_module` imports below resolve | |
| sys.path.insert(0, str(Path(__file__).parent)) | |
| # Set a dummy DATABASE_URL before importing app modules | |
| os.environ['DATABASE_URL'] = 'sqlite:///./test.db' | |
| from app.models.models import Candidate | |
| from app.services.cv_extractor import CVExtractionService | |
| from ai_module.nlp.enhanced_skill_extractor import EnhancedSkillExtractor | |
# CV fixtures live at module level so the multi-line literals can stay
# flush-left without fighting the function's indentation.
_SAMPLE_CV = """
JOHN SMITH
Email: john.smith@example.com
Phone: +33 6 12 34 56 78
LinkedIn: linkedin.com/in/johnsmith
PROFESSIONAL SUMMARY
Senior Full Stack Developer with 8 years of experience in web development.
EXPERIENCE
Senior Developer - Tech Company Inc (2020-2024)
- Led team of 5 developers
- Built microservices using Python and FastAPI
- Managed PostgreSQL databases
Junior Developer - Startup LLC (2016-2020)
- Developed React frontend applications
- Worked with Node.js backend
EDUCATION
Bachelor of Science in Computer Science
University of Technology (2016)
SKILLS
Languages: Python, JavaScript, TypeScript, SQL, HTML/CSS
Frameworks: FastAPI, React, Django, Node.js
Databases: PostgreSQL, MongoDB, Redis
Tools: Docker, Kubernetes, Git, AWS
Soft Skills: Leadership, Communication, Project Management
"""

# Deliberately irregular layout: pipe-separated header, dates before titles,
# no explicit section headings for contact details or experience.
_ATYPICAL_CV = """
Jane Doe | Data Engineer | Paris
contact: jane.doe@mail.com | +33 7 11 22 33 44
github.com/janedoe | janedoe.dev
2022 - Present | Data Engineer | Blue Analytics
Built ETL pipelines on Airflow and Spark
Implemented data quality checks and dashboards
2020-2022 - BI Analyst - Retail Group
Automated SQL reporting and Power BI models
Certifications
AWS Certified Cloud Practitioner
Scrum Master PSM I
Projects
Customer churn prediction using Python and scikit-learn
"""


def test_ner_integration() -> bool:
    """Run the Étape 5-6 NER integration checks and report progress on stdout.

    Steps, in order:

    1.  Instantiate ``CVExtractionService``.
    2.  Extract structured data from a conventional sample CV.
    2b. Extract from an atypically laid-out CV and require an email, at least
        one experience, and at least one GitHub/portfolio link.
    3.  Convert the first extraction result with ``to_candidate_dict``.
    4.  Require every expected ``extracted_*`` key in that dict.
    4b. Require the extended keys (experiences, projects, certifications,
        github_urls, portfolio_urls) in the structured payload.
    5.  Exercise ``EnhancedSkillExtractor.extract_skills_hybrid`` (best-effort:
        a failure is printed but does not fail the run).
    6.  Require the NER columns on the ``Candidate`` table.

    Returns:
        ``True`` when every mandatory step passes, ``False`` as soon as one
        of them fails.

    NOTE(review): the outcome is reported through the return value because the
    ``__main__`` guard turns it into an exit status. pytest does not treat a
    returned ``False`` as a failure (it warns about, and in recent releases
    rejects, non-None returns from tests), so this check is only reliable
    when the file is run as a script - confirm how it is invoked.
    """
    # The original status glyphs were mis-decoded into one identical character
    # for both pass and fail; distinct markers make the report readable again.
    ok, ko = "✓", "✗"
    banner = "=" * 70

    print(banner)
    print("NER INTEGRATION TEST - Étape 5-6")
    print(banner)

    # Test 1: Create extraction service
    print("\n[TEST 1] Creating CVExtractionService...")
    try:
        service = CVExtractionService()
        print(f"{ok} Service created successfully")
    except Exception as e:
        print(f"{ko} Failed to create service: {e}")
        return False

    # Test 2: Extract from text
    print("\n[TEST 2] Extracting structured data from CV text...")
    try:
        result = service.extract_from_text(_SAMPLE_CV)
        print(f"{ok} Extraction completed")
        print(f" - Quality Score: {result.quality_score:.1f}%")
        print(f" - Entities Found: {result.extraction_metadata.get('entities_found', 0)}")
        print(f" - Skills Extracted: {len(result.skills)}")
        structured = result.structured
        print(f" - Experiences: {len(structured.get('experiences', []))}")
        print(f" - Projects: {len(structured.get('projects', []))}")
    except Exception as e:
        print(f"{ko} Extraction failed: {e}")
        return False

    # Test 2b: Atypical CV format should still produce meaningful extraction
    print("\n[TEST 2b] Testing atypical CV layout robustness...")
    try:
        # Guarded like every other step so an extraction error is reported
        # as a failed check instead of aborting the whole run with a traceback.
        atypical_result = service.extract_from_text(_ATYPICAL_CV)
        atypical_structured = atypical_result.structured
    except Exception as e:
        print(f"{ko} Atypical extraction failed: {e}")
        return False
    if not atypical_structured.get('email'):
        print(f"{ko} Atypical layout: email was not extracted")
        return False
    if not atypical_structured.get('experiences'):
        print(f"{ko} Atypical layout: experiences were not extracted")
        return False
    # Either kind of web link is enough to count as extracted.
    if not (atypical_structured.get('github_urls') or atypical_structured.get('portfolio_urls')):
        print(f"{ko} Atypical layout: web links were not extracted")
        return False
    print(f"{ok} Atypical layout extraction is robust")

    # Test 3: Convert to candidate dict
    print("\n[TEST 3] Converting extraction result to candidate dict...")
    try:
        candidate_dict = service.to_candidate_dict(result)
        print(f"{ok} Conversion successful")
        print(f" - Full Name: {candidate_dict.get('full_name', 'N/A')}")
        print(f" - Email: {candidate_dict.get('email', 'N/A')}")
        print(f" - Extracted Name: {candidate_dict.get('extracted_name', 'N/A')}")
        print(f" - Quality Score: {candidate_dict.get('extraction_quality_score', 0):.1f}%")
    except Exception as e:
        print(f"{ko} Conversion failed: {e}")
        return False

    # Test 4: Verify required NER fields
    print("\n[TEST 4] Validating NER fields in candidate dict...")
    ner_fields = [
        'extracted_name',
        'extracted_emails',
        'extracted_phones',
        'extracted_job_titles',
        'extracted_companies',
        'extracted_education',
        'extraction_quality_score',
        'is_fully_extracted',
    ]
    missing_fields = [field for field in ner_fields if field not in candidate_dict]
    if missing_fields:
        print(f"{ko} Missing fields: {missing_fields}")
        return False
    print(f"{ok} All NER fields present")

    # Test 4b: Verify rich structured payload contains generalized fields
    # (`structured` is the payload of the conventional sample CV from TEST 2).
    print("\n[TEST 4b] Validating extended structured fields...")
    if not isinstance(structured.get('experiences', []), list):
        print(f"{ko} experiences should be a list")
        return False
    if 'projects' not in structured or 'certifications' not in structured:
        print(f"{ko} Missing projects/certifications in structured payload")
        return False
    if 'github_urls' not in structured or 'portfolio_urls' not in structured:
        print(f"{ko} Missing github_urls/portfolio_urls in structured payload")
        return False
    print(f"{ok} Extended structured fields are present")

    # Test 5: Verify EnhancedSkillExtractor
    print("\n[TEST 5] Testing EnhancedSkillExtractor hybrid extraction...")
    try:
        skill_extractor = EnhancedSkillExtractor(load_ner=False)
        skills = skill_extractor.extract_skills_hybrid(_SAMPLE_CV)
        print(f"{ok} Hybrid skill extraction working")
        print(f" - Total skills extracted: {len(skills)}")
        if skills:
            print(" - Top 3 skills:")
            for skill in skills[:3]:
                print(f" • {skill['name']} ({skill['category']}) - Score: {skill['confidence']:.0%}")
    except Exception as e:
        # Deliberately non-fatal: NER might not be available in this
        # environment, so the problem is reported and the run continues.
        print(f"{ko} Skill extraction failed: {e}")
        print(" (Note: NER may not be available, but fallback should work)")

    # Test 6: Verify model schema
    print("\n[TEST 6] Validating Candidate model schema...")
    try:
        # Check that Candidate class has NER columns
        candidate_columns = {col.name for col in Candidate.__table__.columns}
        ner_columns = {
            'extracted_name', 'extracted_emails', 'extracted_phones',
            'extracted_job_titles', 'extracted_companies', 'extracted_education',
            'extraction_quality_score', 'ner_extraction_data', 'is_fully_extracted',
        }
        missing = ner_columns - candidate_columns
        if missing:
            print(f"{ko} Missing columns in Candidate model: {missing}")
            return False
        print(f"{ok} All NER columns present in Candidate model")
    except Exception as e:
        print(f"{ko} Schema validation failed: {e}")
        return False

    print("\n" + banner)
    print(f"{ok} ALL TESTS PASSED - NER Integration Successful!")
    print(banner)
    return True
if __name__ == "__main__":
    # Script entry point: map the boolean outcome of the integration run to a
    # process exit status (0 when it passed, 1 otherwise).
    exit_status = 0 if test_ner_integration() else 1
    sys.exit(exit_status)