# Source: ai-talent-finder-backend/ai_module/nlp/resume_ner_extractor 2.py
# Author: ilyass yani
# Commit 9df97a2: "Deploiement backend dans HF Spaces" (backend deployment to HF Spaces)
# File size: 13.4 kB
# Resume NER Extractor - ready-to-use implementation
# File: backend/ai_module/nlp/resume_ner_extractor.py
"""
Resume parsing using BERT-based Named Entity Recognition.

Model: AventIQ-AI/Resume-Parsing-NER-AI-Model
Useful for: full resume extraction (name, email, skills, experience, etc.)
"""
import re
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass
# Optional dependency: the module stays importable without `transformers`;
# TRANSFORMERS_AVAILABLE records whether the import succeeded so that the
# extractor can refuse to start with a clear error instead of a NameError.
try:
    from transformers import pipeline
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    print("⚠️ Warning: transformers not installed. Install with: pip install transformers torch")
else:
    TRANSFORMERS_AVAILABLE = True
@dataclass
class ExtractedEntity:
    """Container for a single extracted entity."""

    text: str  # Surface text of the entity
    entity_type: str  # NAME, EMAIL, PHONE, JOB, COMPANY, SKILL, EDUCATION
    confidence: float  # Model score; lowest token score when tokens were merged
    start_char: int = 0  # Start offset in the analysed text (0 when unknown)
    end_char: int = 0  # End offset, exclusive (0 when the offsets are unknown)


class ResumeNERExtractor:
    """
    Extract structured data from a resume using a BERT-based NER model.

    Supported entity types:
    - NAME: Candidate's full name
    - EMAIL: Email addresses
    - PHONE: Phone numbers
    - JOB: Job titles (Senior Developer, Manager, etc.)
    - COMPANY: Company names
    - SKILL: Technical/soft skills
    - EDUCATION: Educational degrees and qualifications
    """

    MODEL_NAME = "AventIQ-AI/Resume-Parsing-NER-AI-Model"
    MIN_CONFIDENCE = 0.75  # Results scoring below this value are ignored
    # Maximum number of CHARACTERS sent to the model per call. Longer texts
    # are processed chunk by chunk; the value is meant to keep every call
    # inside BERT's 512-token input limit.
    MAX_TEXT_LENGTH = 512

    # Mapping from model labels to normalized categories
    ENTITY_MAPPING = {
        "B-NAME": "NAME",
        "I-NAME": "NAME",
        "B-EMAIL": "EMAIL",
        "I-EMAIL": "EMAIL",
        "B-PHONE": "PHONE",
        "I-PHONE": "PHONE",
        "B-EDUCATION": "EDUCATION",
        "I-EDUCATION": "EDUCATION",
        "B-SKILL": "SKILL",
        "I-SKILL": "SKILL",
        "B-JOB": "JOB",
        "I-JOB": "JOB",
        "B-COMPANY": "COMPANY",
        "I-COMPANY": "COMPANY",
        "O": "OTHER",
    }

    def __init__(self):
        """
        Load the NER pipeline.

        Raises:
            ImportError: if the `transformers` package is not installed.
            Exception: any error raised while loading the model is printed
                and re-raised unchanged.
        """
        if not TRANSFORMERS_AVAILABLE:
            raise ImportError(
                "Transformers library required. Install with: pip install transformers torch"
            )
        print("📍 Loading Resume NER model... (first time may take a minute)")
        try:
            self.ner_pipeline = pipeline(
                "ner",
                model=self.MODEL_NAME,
                # "simple" makes the pipeline merge adjacent tokens of the
                # same entity itself; each result then carries its label
                # under "entity_group" (no B-/I- prefix) instead of "entity".
                aggregation_strategy="simple",
            )
            print("✅ Model loaded successfully!")
        except Exception as e:
            print(f"❌ Error loading model: {e}")
            raise

    def extract_all_entities(self, text: str) -> Dict[str, List[ExtractedEntity]]:
        """
        Extract all entities from resume text.

        The text is analysed in chunks of at most MAX_TEXT_LENGTH characters
        so that long resumes are covered entirely instead of being cut after
        the first chunk. Entity offsets refer to positions in `text`.

        Args:
            text: Resume text (can be multi-paragraph)

        Returns:
            Dictionary with entity types as keys and lists of ExtractedEntity
            as values, e.g. {"NAME": [...], "EMAIL": [...], "SKILL": [...]}.
            Empty dict for empty/blank input or when nothing was extracted.
        """
        if not text or not text.strip():
            return {}

        grouped: Dict[str, List[ExtractedEntity]] = {}
        for offset, chunk in self._split_into_chunks(text):
            try:
                ner_results = self.ner_pipeline(chunk)
            except Exception as e:
                # Best effort: report the failure and keep whatever the
                # other chunks produced (nothing, for a single-chunk text).
                print(f"❌ NER extraction failed: {e}")
                continue

            for entity in self._parse_ner_results(ner_results, chunk):
                if entity.end_char:
                    # Offsets are chunk-relative; shift them to full-text positions.
                    entity.start_char += offset
                    entity.end_char += offset
                grouped.setdefault(entity.entity_type, []).append(entity)
        return grouped

    def _split_into_chunks(self, text: str) -> List[Tuple[int, str]]:
        """
        Split `text` into consecutive (offset, chunk) pieces.

        Every chunk is at most MAX_TEXT_LENGTH characters long. Cuts are made
        after a line break when possible, otherwise after a space/tab, and
        only as a last resort in the middle of a word. Whitespace-only chunks
        are omitted.
        """
        limit = max(1, self.MAX_TEXT_LENGTH)  # guards against a non-positive override
        total = len(text)
        chunks: List[Tuple[int, str]] = []
        start = 0
        while start < total:
            end = min(start + limit, total)
            if end < total:
                window = text[start:end]
                cut = window.rfind("\n")
                if cut <= 0:
                    cut = max(window.rfind(" "), window.rfind("\t"))
                if cut > 0:
                    end = start + cut + 1
            chunk = text[start:end]
            if chunk.strip():
                chunks.append((start, chunk))
            start = end
        return chunks

    def _parse_ner_results(self, ner_results: list, original_text: str) -> List[ExtractedEntity]:
        """
        Parse raw NER pipeline results into ExtractedEntity objects.

        Both pipeline output shapes are accepted:
        - per-token results with B-/I- labels under "entity"; consecutive
          I- tokens of the same type are merged into the entity opened by
          the preceding token;
        - aggregated results with an un-prefixed label under "entity_group";
          each result is taken as one complete entity.

        When a result carries valid "start"/"end" offsets, the entity text is
        taken from `original_text` (so "john.smith@gmail.com" is not rebuilt
        as space-separated tokens) and start_char/end_char are filled in.

        Args:
            ner_results: Output of the transformers NER pipeline
            original_text: The exact text that was passed to the pipeline

        Returns:
            List of ExtractedEntity objects, in order of appearance
        """
        entities: List[ExtractedEntity] = []
        current: Optional[ExtractedEntity] = None

        for result in ner_results or []:
            label = str(result.get("entity_group") or result.get("entity") or "O")
            token = str(result.get("word", ""))
            # Plain float keeps the profile JSON-serializable even if the
            # pipeline hands back numpy scalars.
            score = float(result.get("score", 0.0))

            prefix, _, bare_label = label.partition("-")
            is_bio = prefix in ("B", "I") and bool(bare_label)
            if is_bio:
                entity_type = self.ENTITY_MAPPING.get(label, "OTHER")
            else:
                # Aggregated labels come without prefix ("NAME"); look them
                # up through their "B-" form so the table stays authoritative.
                entity_type = (
                    self.ENTITY_MAPPING.get(label)
                    or self.ENTITY_MAPPING.get(f"B-{label}", "OTHER")
                )

            # Skip if confidence too low
            if score < self.MIN_CONFIDENCE:
                continue

            # Non-entities close whatever entity is currently open
            if entity_type == "OTHER":
                if current is not None:
                    entities.append(current)
                    current = None
                continue

            span = self._token_span(result, len(original_text))
            is_continuation = is_bio and prefix == "I"

            if not is_continuation:
                # B- tag or aggregated group: always starts a new entity
                if current is not None:
                    entities.append(current)
                current = self._start_entity(token, entity_type, score, span, original_text)
            elif current is None:
                # I- tag with nothing to continue: dropped
                continue
            elif current.entity_type == entity_type:
                self._extend_entity(current, token, score, span, original_text)
            else:
                # Entity type changed: save the previous one, start anew
                entities.append(current)
                current = self._start_entity(token, entity_type, score, span, original_text)

        # Don't forget the last entity
        if current is not None:
            entities.append(current)
        return entities

    @staticmethod
    def _token_span(result: Dict, text_length: int) -> Optional[Tuple[int, int]]:
        """Return the result's (start, end) offsets, or None if absent/invalid."""
        start, end = result.get("start"), result.get("end")
        if start is None or end is None:
            return None
        try:
            start, end = int(start), int(end)
        except (TypeError, ValueError):
            return None
        if 0 <= start < end <= text_length:
            return start, end
        return None

    @staticmethod
    def _start_entity(
        token: str,
        entity_type: str,
        score: float,
        span: Optional[Tuple[int, int]],
        original_text: str,
    ) -> ExtractedEntity:
        """Create a new entity from its first token (or from a whole group)."""
        if span is None:
            return ExtractedEntity(text=token, entity_type=entity_type, confidence=score)
        start, end = span
        return ExtractedEntity(
            text=original_text[start:end],
            entity_type=entity_type,
            confidence=score,
            start_char=start,
            end_char=end,
        )

    @staticmethod
    def _extend_entity(
        entity: ExtractedEntity,
        token: str,
        score: float,
        span: Optional[Tuple[int, int]],
        original_text: str,
    ) -> None:
        """Append a continuation token to `entity` in place."""
        is_subword = token.startswith("##")
        # Default joining rule: word pieces are glued, other tokens spaced
        joiner = "" if is_subword else " "

        if span is None:
            surface = token[2:] if is_subword else token  # drop only the leading marker
        else:
            start, end = span
            surface = original_text[start:end]
            if entity.end_char:
                if start >= entity.end_char:
                    gap = original_text[entity.end_char:start]
                    if not gap:
                        joiner = ""  # directly adjacent in the source text
                    elif gap.isspace():
                        joiner = " "  # any whitespace run becomes one space
                entity.end_char = max(entity.end_char, end)

        entity.text += joiner + surface
        # Use minimum confidence
        entity.confidence = min(entity.confidence, score)

    def extract_structured_profile(self, text: str) -> Dict:
        """
        Extract resume data into a structured candidate profile.

        Returns:
            {
                "name": str or None (highest-confidence NAME entity),
                "emails": List[str],
                "phones": List[str],
                "job_titles": List[{"title": str, "confidence": float}],
                "companies": List[{"name": str, "confidence": float}],
                "skills": List[{"name": str, "confidence": float}],
                "education": List[{"degree": str, "confidence": float}],
                "extraction_metadata": {
                    "total_entities_found": int,
                    "entity_types_found": List[str],
                    "confidence_scores": {"name": float} (only when a name was found)
                },
                "quality_score": float (0-100)
            }
        """
        entities = self.extract_all_entities(text)

        def plain_texts(entity_type: str) -> List[str]:
            return [e.text.strip() for e in entities.get(entity_type, [])]

        def scored(entity_type: str, label_key: str) -> List[Dict]:
            return [
                {label_key: e.text.strip(), "confidence": e.confidence}
                for e in entities.get(entity_type, [])
            ]

        profile = {
            "name": None,
            "emails": plain_texts("EMAIL"),
            "phones": plain_texts("PHONE"),
            "job_titles": scored("JOB", "title"),
            "companies": scored("COMPANY", "name"),
            "skills": scored("SKILL", "name"),
            "education": scored("EDUCATION", "degree"),
            "extraction_metadata": {
                "total_entities_found": sum(len(v) for v in entities.values()),
                "entity_types_found": list(entities.keys()),
                "confidence_scores": {},
            },
        }

        # Several NAME candidates may be found; keep the most confident one
        name_candidates = entities.get("NAME")
        if name_candidates:
            best_name = max(name_candidates, key=lambda e: e.confidence)
            profile["name"] = best_name.text.strip()
            profile["extraction_metadata"]["confidence_scores"]["name"] = best_name.confidence

        profile["quality_score"] = self._calculate_quality_score(profile)
        return profile

    def _calculate_quality_score(self, profile: Dict) -> float:
        """
        Calculate the extraction quality score (0-100).

        Each of name, emails, job_titles, companies and skills contributes
        20 points when it is present and non-empty. Phones and education do
        not influence the score.
        """
        max_score = 100
        weights = {
            "name": 20,
            "emails": 20,
            "job_titles": 20,
            "companies": 20,
            "skills": 20,
        }
        score = sum(weight for field, weight in weights.items() if profile.get(field))
        return float(min(score, max_score))

    def extract_with_fallback(self, text: str, fallback_extractor=None) -> Dict:
        """
        Extract using NER; report when a fallback extraction would be needed.

        NOTE: `fallback_extractor` is currently never called. When one is
        supplied and the quality score is below 50, a warning is printed and
        the NER results are returned unchanged.
        TODO: merge the fallback extractor's output once its interface is defined.

        Args:
            text: Resume text
            fallback_extractor: Optional fallback (e.g., SkillExtractor)

        Returns:
            The profile produced by extract_structured_profile
        """
        ner_results = self.extract_structured_profile(text)
        if fallback_extractor and ner_results.get("quality_score", 0) < 50:
            print("⚠️ NER quality low, attempting fallback extraction...")
            # Could merge with other extractors here
        return ner_results
# ============================================================================
# USAGE EXAMPLES
# ============================================================================
if __name__ == "__main__":
    # Demo script: runs the extractor on an inline sample resume, then on
    # backend/test_cv.txt if that file exists. Requires the model to load.
    import json

    def _print_banner(title):
        """Print a section title framed by two 60-character rules."""
        rule = "=" * 60
        print("\n" + rule)
        print(title)
        print(rule)

    # Example 1: Basic usage
    _print_banner("EXAMPLE 1: Basic Entity Extraction")
    extractor = ResumeNERExtractor()
    sample_text = """
John Smith
john.smith@gmail.com
Tel: +33612345678
Professional Experience:
Senior Python Developer at Google (2020-2023)
Full Stack Engineer at Amazon (2018-2020)
Skills: Python, FastAPI, React, Docker, Kubernetes, AWS
Education:
Bachelor of Science in Computer Science
University of California, Berkeley
"""
    found_by_type = extractor.extract_all_entities(sample_text)
    print("\nExtracted Entities:")
    for type_name in found_by_type:
        print(f"\n{type_name}:")
        for item in found_by_type[type_name]:
            print(f" - {item.text:40} (confidence: {item.confidence:.2f})")

    # Example 2: Structured profile
    _print_banner("EXAMPLE 2: Structured Profile Extraction")
    sample_profile = extractor.extract_structured_profile(sample_text)
    print(json.dumps(sample_profile, indent=2))

    # Example 3: Read from file
    _print_banner("EXAMPLE 3: Extract from File")
    try:
        with open("backend/test_cv.txt", encoding="utf-8") as cv_file:
            cv_text = cv_file.read()
        file_profile = extractor.extract_structured_profile(cv_text)
        print(f"\nQuality Score: {file_profile['quality_score']:.1f}/100")
        print(f"Name: {file_profile['name']}")
        print(f"Emails: {file_profile['emails']}")
        print(f"Skills found: {len(file_profile['skills'])}")
        print(f"Job titles: {len(file_profile['job_titles'])}")
    except FileNotFoundError:
        print("⚠️ test_cv.txt not found")