Spaces:

RHmaster
/

ai-talent-finder-backend

Sleeping

ai-talent-finder-backend / test_ner_integration.py

ilyass yani

Deploiement backend dans HF Spaces

9df97a2 11 days ago

7.87 kB

	"""
	Test NER Integration - Validates complete Étape 5-6 pipeline
	"""

	import sys
	import os
	from pathlib import Path

	# Put the directory containing this file at the front of sys.path so the
	# project imports further down are looked up there first.
	# NOTE(review): presumably this directory is the backend root that holds
	# the `app` and `ai_module` packages - confirm.
	sys.path.insert(0, str(Path(__file__).parent))

	# Point DATABASE_URL at a local SQLite file *before* the app modules are
	# imported. This assignment overwrites any DATABASE_URL already present in
	# the environment.
	# NOTE(review): presumably the app reads DATABASE_URL at import time, which
	# is why this must stay above the imports below - confirm.
	os.environ['DATABASE_URL'] = 'sqlite:///./test.db'

	# Project imports: intentionally placed after the path/env setup above.
	from app.models.models import Candidate
	from app.services.cv_extractor import CVExtractionService
	from ai_module.nlp.enhanced_skill_extractor import EnhancedSkillExtractor


	def test_ner_integration():
	    """Run the end-to-end NER integration checks and report the result.

	    The checks run in order and stop at the first mandatory failure:

	    1.  Instantiate ``CVExtractionService``.
	    2.  Extract structured data from a conventional sample CV.
	    2b. Extract from an atypical, pipe-delimited CV and require an email,
	        at least one experience and at least one GitHub/portfolio URL.
	    3.  Convert the first extraction result with ``to_candidate_dict``.
	    4.  Require every NER key in the candidate dict.
	    4b. Require the extended keys in the structured payload.
	    5.  Run ``EnhancedSkillExtractor`` hybrid extraction (a failure here is
	        reported but does not fail the run).
	    6.  Require the NER columns on the ``Candidate`` model table.

	    Progress and diagnostics are written to stdout with ``print``.

	    Returns:
	        bool: ``True`` when every mandatory check passed, ``False`` as soon
	        as one of them fails.

	    Note:
	        The outcome is reported through the return value, not through
	        ``assert``. pytest does not fail a test that returns ``False``, so
	        run this file as a script to get a reliable exit status.
	    """

	    # Sample CV text
	    sample_cv = """
	JOHN SMITH
	Email: john.smith@example.com
	Phone: +33 6 12 34 56 78
	LinkedIn: linkedin.com/in/johnsmith

	PROFESSIONAL SUMMARY
	Senior Full Stack Developer with 8 years of experience in web development.

	EXPERIENCE
	Senior Developer - Tech Company Inc (2020-2024)
	- Led team of 5 developers
	- Built microservices using Python and FastAPI
	- Managed PostgreSQL databases

	Junior Developer - Startup LLC (2016-2020)
	- Developed React frontend applications
	- Worked with Node.js backend

	EDUCATION
	Bachelor of Science in Computer Science
	University of Technology (2016)

	SKILLS
	Languages: Python, JavaScript, TypeScript, SQL, HTML/CSS
	Frameworks: FastAPI, React, Django, Node.js
	Databases: PostgreSQL, MongoDB, Redis
	Tools: Docker, Kubernetes, Git, AWS
	Soft Skills: Leadership, Communication, Project Management
	"""

	    print("=" * 70)
	    print("NER INTEGRATION TEST - Étape 5-6")
	    print("=" * 70)

	    # Test 1: Create extraction service
	    print("\n[TEST 1] Creating CVExtractionService...")
	    try:
	        service = CVExtractionService()
	        print("✅ Service created successfully")
	    except Exception as e:
	        print(f"❌ Failed to create service: {e}")
	        return False

	    # Test 2: Extract from text
	    print("\n[TEST 2] Extracting structured data from CV text...")
	    try:
	        result = service.extract_from_text(sample_cv)
	        print("✅ Extraction completed")
	        print(f" - Quality Score: {result.quality_score:.1f}%")
	        print(f" - Entities Found: {result.extraction_metadata.get('entities_found', 0)}")
	        print(f" - Skills Extracted: {len(result.skills)}")
	        # Kept for the structured-payload validation in Test 4b.
	        structured = result.structured
	        print(f" - Experiences: {len(structured.get('experiences', []))}")
	        print(f" - Projects: {len(structured.get('projects', []))}")
	    except Exception as e:
	        print(f"❌ Extraction failed: {e}")
	        return False

	    # Test 2b: Atypical CV format should still produce meaningful extraction
	    print("\n[TEST 2b] Testing atypical CV layout robustness...")
	    # Fields are separated by plain '|' characters. A backslash before the
	    # pipe is not a valid escape sequence in a non-raw string literal, so
	    # none is used here.
	    atypical_cv = """
	Jane Doe | Data Engineer | Paris
	contact: jane.doe@mail.com | +33 7 11 22 33 44
	github.com/janedoe | janedoe.dev

	2022 - Present | Data Engineer | Blue Analytics
	Built ETL pipelines on Airflow and Spark
	Implemented data quality checks and dashboards

	2020-2022 - BI Analyst - Retail Group
	Automated SQL reporting and Power BI models

	Certifications
	AWS Certified Cloud Practitioner
	Scrum Master PSM I

	Projects
	Customer churn prediction using Python and scikit-learn
	"""
	    # Guarded like the other extraction steps so that an exception is
	    # reported as a failed check instead of aborting the whole run.
	    try:
	        atypical_result = service.extract_from_text(atypical_cv)
	        atypical_structured = atypical_result.structured
	    except Exception as e:
	        print(f"❌ Atypical layout extraction failed: {e}")
	        return False

	    if not atypical_structured.get('email'):
	        print("❌ Atypical layout: email was not extracted")
	        return False
	    if not atypical_structured.get('experiences'):
	        print("❌ Atypical layout: experiences were not extracted")
	        return False
	    # Either kind of web link is enough to pass this check.
	    if not atypical_structured.get('github_urls') and not atypical_structured.get('portfolio_urls'):
	        print("❌ Atypical layout: web links were not extracted")
	        return False
	    print("✅ Atypical layout extraction is robust")

	    # Test 3: Convert to candidate dict
	    print("\n[TEST 3] Converting extraction result to candidate dict...")
	    try:
	        candidate_dict = service.to_candidate_dict(result)
	        print("✅ Conversion successful")
	        print(f" - Full Name: {candidate_dict.get('full_name', 'N/A')}")
	        print(f" - Email: {candidate_dict.get('email', 'N/A')}")
	        print(f" - Extracted Name: {candidate_dict.get('extracted_name', 'N/A')}")
	        print(f" - Quality Score: {candidate_dict.get('extraction_quality_score', 0):.1f}%")
	    except Exception as e:
	        print(f"❌ Conversion failed: {e}")
	        return False

	    # Test 4: Verify required NER fields
	    print("\n[TEST 4] Validating NER fields in candidate dict...")
	    ner_fields = [
	        'extracted_name',
	        'extracted_emails',
	        'extracted_phones',
	        'extracted_job_titles',
	        'extracted_companies',
	        'extracted_education',
	        'extraction_quality_score',
	        'is_fully_extracted',
	    ]

	    missing_fields = [field for field in ner_fields if field not in candidate_dict]
	    if missing_fields:
	        print(f"❌ Missing fields: {missing_fields}")
	        return False
	    print("✅ All NER fields present")

	    # Test 4b: Verify rich structured payload contains generalized fields
	    print("\n[TEST 4b] Validating extended structured fields...")
	    if not isinstance(structured.get('experiences', []), list):
	        print("❌ experiences should be a list")
	        return False
	    if 'projects' not in structured or 'certifications' not in structured:
	        print("❌ Missing projects/certifications in structured payload")
	        return False
	    if 'github_urls' not in structured or 'portfolio_urls' not in structured:
	        print("❌ Missing github_urls/portfolio_urls in structured payload")
	        return False
	    print("✅ Extended structured fields are present")

	    # Test 5: Verify EnhancedSkillExtractor
	    print("\n[TEST 5] Testing EnhancedSkillExtractor hybrid extraction...")
	    try:
	        skill_extractor = EnhancedSkillExtractor(load_ner=False)
	        skills = skill_extractor.extract_skills_hybrid(sample_cv)
	        print("✅ Hybrid skill extraction working")
	        print(f" - Total skills extracted: {len(skills)}")

	        if skills:
	            print(" - Top 3 skills:")
	            for skill in skills[:3]:
	                print(f" • {skill['name']} ({skill['category']}) - Score: {skill['confidence']:.0%}")
	    except Exception as e:
	        # Deliberately non-fatal: the failure is reported but the run
	        # continues, because NER might not be available.
	        print(f"❌ Skill extraction failed: {e}")
	        print(" (Note: NER may not be available, but fallback should work)")

	    # Test 6: Verify model schema
	    print("\n[TEST 6] Validating Candidate model schema...")
	    ner_columns = {
	        'extracted_name', 'extracted_emails', 'extracted_phones',
	        'extracted_job_titles', 'extracted_companies', 'extracted_education',
	        'extraction_quality_score', 'ner_extraction_data', 'is_fully_extracted',
	    }
	    try:
	        # Check that Candidate class has NER columns
	        candidate_columns = {col.name for col in Candidate.__table__.columns}
	    except Exception as e:
	        print(f"❌ Schema validation failed: {e}")
	        return False

	    missing = ner_columns - candidate_columns
	    if missing:
	        print(f"❌ Missing columns in Candidate model: {missing}")
	        return False
	    print("✅ All NER columns present in Candidate model")

	    print("\n" + "=" * 70)
	    print("✅ ALL TESTS PASSED - NER Integration Successful!")
	    print("=" * 70)
	    return True


	if __name__ == "__main__":
	    # Script entry point: the process exits with status 0 when the
	    # integration run reports success and with status 1 otherwise.
	    sys.exit(int(not test_ner_integration()))