| |
| |
|
|
"""
Resume parsing using BERT-based Named Entity Recognition.

Model: AventIQ-AI/Resume-Parsing-NER-AI-Model
Useful for: full resume/CV extraction (name, email, skills, experience, etc.)
"""
|
|
| import re |
| from typing import List, Dict, Optional, Tuple |
| from dataclasses import dataclass |
|
|
| try: |
| from transformers import pipeline |
| TRANSFORMERS_AVAILABLE = True |
| except ImportError: |
| TRANSFORMERS_AVAILABLE = False |
| print("⚠️ Warning: transformers not installed. Install with: pip install transformers torch") |
|
|
|
|
@dataclass
class ExtractedEntity:
    """One entity found in a resume: its text, its type and how sure we are.

    ``start_char`` and ``end_char`` are character positions; both default
    to 0 when the caller does not supply them.
    """

    # Text of the entity as extracted.
    text: str
    # Entity category label (for example "NAME" or "SKILL").
    entity_type: str
    # Confidence score attached to the extraction.
    confidence: float
    # Character position where the entity begins (0 when not provided).
    start_char: int = 0
    # Character position where the entity ends (0 when not provided).
    end_char: int = 0
|
|
|
|
class ResumeNERExtractor:
    """
    Extract structured data from resume text using a BERT-based NER model.

    Supported entity types:
    - NAME: Candidate's full name
    - EMAIL: Email addresses
    - PHONE: Phone numbers
    - JOB: Job titles (Senior Developer, Manager, etc.)
    - COMPANY: Company names
    - SKILL: Technical/soft skills
    - EDUCATION: Educational degrees and qualifications

    The parser accepts both shapes of pipeline output: token-level results
    (``entity`` key, ``B-``/``I-`` prefixed labels) and aggregated results
    (``entity_group`` key, bare labels such as ``"NAME"``).
    """

    MODEL_NAME = "AventIQ-AI/Resume-Parsing-NER-AI-Model"
    # Results scoring below this threshold are ignored.
    MIN_CONFIDENCE = 0.75
    # NOTE: applied as a *character* cap on the input text before inference.
    MAX_TEXT_LENGTH = 512

    # Maps model labels (IOB scheme) to the public entity type names.
    ENTITY_MAPPING = {
        "B-NAME": "NAME",
        "I-NAME": "NAME",
        "B-EMAIL": "EMAIL",
        "I-EMAIL": "EMAIL",
        "B-PHONE": "PHONE",
        "I-PHONE": "PHONE",
        "B-EDUCATION": "EDUCATION",
        "I-EDUCATION": "EDUCATION",
        "B-SKILL": "SKILL",
        "I-SKILL": "SKILL",
        "B-JOB": "JOB",
        "I-JOB": "JOB",
        "B-COMPANY": "COMPANY",
        "I-COMPANY": "COMPANY",
        "O": "OTHER",
    }

    def __init__(self):
        """Load the NER pipeline.

        Raises:
            ImportError: if the ``transformers`` library is not installed.
            Exception: whatever the pipeline constructor raises is re-raised
                after being reported on stdout.
        """
        if not TRANSFORMERS_AVAILABLE:
            raise ImportError(
                "Transformers library required. Install with: pip install transformers torch"
            )

        print("📍 Loading Resume NER model... (first time may take a minute)")
        try:
            self.ner_pipeline = pipeline(
                "ner",
                model=self.MODEL_NAME,
                aggregation_strategy="simple",
            )
        except Exception as e:
            print(f"❌ Error loading model: {e}")
            raise
        else:
            print("✅ Model loaded successfully!")

    def extract_all_entities(self, text: str) -> Dict[str, List["ExtractedEntity"]]:
        """
        Extract all entities from resume text.

        Args:
            text: Resume text (can be multi-paragraph). Only the first
                ``MAX_TEXT_LENGTH`` characters are sent to the model.

        Returns:
            Dictionary with entity types as keys and lists of ExtractedEntity
            as values, e.g. ``{"NAME": [...], "EMAIL": [...], "SKILL": [...]}``.
            Empty when the text is blank or the pipeline call fails.
        """
        if not text or not text.strip():
            return {}

        # The truncated text is a prefix of the input, so character offsets
        # reported by the pipeline remain valid for the caller's string too.
        text = text[:self.MAX_TEXT_LENGTH]

        try:
            ner_results = self.ner_pipeline(text)
        except Exception as e:
            # Best effort: report the failure and return "nothing found".
            print(f"❌ NER extraction failed: {e}")
            return {}

        grouped = {}
        for entity in self._parse_ner_results(ner_results, text):
            grouped.setdefault(entity.entity_type, []).append(entity)
        return grouped

    def _parse_ner_results(self, ner_results: list, original_text: str) -> List["ExtractedEntity"]:
        """
        Turn raw NER pipeline results into ExtractedEntity objects.

        Args:
            ner_results: Output of the transformers NER pipeline: dicts with
                ``word`` and ``score``, a label under ``entity`` (token-level)
                or ``entity_group`` (aggregated), and optionally ``start``/``end``.
            original_text: Text the offsets refer to. When offsets are present
                the entity text is taken from this string, which undoes
                tokenizer artefacts (lower-casing, "a . b @ c" spacing).

        Returns:
            List of ExtractedEntity objects in order of appearance.
        """
        known_types = set(self.ENTITY_MAPPING.values()) - {"OTHER"}
        entities = []
        current = None

        for result in ner_results or []:
            # Aggregated pipelines use "entity_group", token-level ones "entity".
            label = result.get("entity_group") or result.get("entity") or "O"
            # Cast to a plain float: pipeline scores are numpy scalars, which
            # the json module refuses to serialize.
            score = float(result.get("score", 0.0))

            # Low-confidence results are skipped without closing the entity
            # being built, so a confident continuation can still extend it.
            if score < self.MIN_CONFIDENCE:
                continue

            entity_type = self.ENTITY_MAPPING.get(label)
            if entity_type is None:
                # Bare labels ("NAME") come from aggregated output.
                entity_type = label if label in known_types else "OTHER"

            if entity_type == "OTHER":
                if current is not None:
                    entities.append(current)
                    current = None
                continue

            token = result.get("word", "")
            start, end = result.get("start"), result.get("end")

            continues_current = (
                label.startswith("I-")
                and current is not None
                and current.entity_type == entity_type
            )
            if continues_current:
                # "##" marks a word-piece that glues onto the previous token.
                current.text += token[2:] if token.startswith("##") else f" {token}"
                current.confidence = min(current.confidence, score)
                # Only stretch the span if the entity had offsets to begin with.
                if end is not None and current.end_char:
                    current.end_char = int(end)
                continue

            # Anything else opens a new entity: a B- tag, an aggregated group,
            # an I- tag of a different type, or an I- tag with nothing open
            # (treated as a beginning rather than silently dropped).
            if current is not None:
                entities.append(current)
            has_span = start is not None and end is not None
            current = ExtractedEntity(
                text=token,
                entity_type=entity_type,
                confidence=score,
                start_char=int(start) if has_span else 0,
                end_char=int(end) if has_span else 0,
            )
            if not label.startswith(("B-", "I-")):
                # Aggregated groups are already complete entities.
                entities.append(current)
                current = None

        if current is not None:
            entities.append(current)

        for entity in entities:
            self._restore_surface_text(entity, original_text)
        return entities

    @staticmethod
    def _restore_surface_text(entity, original_text):
        """Replace token-joined text with the exact source slice when offsets allow it."""
        start, end = entity.start_char, entity.end_char
        if original_text and 0 <= start < end <= len(original_text):
            surface = original_text[start:end].strip()
            if surface:
                entity.text = surface

    @staticmethod
    def _scored_items(found, key):
        """Build the ``[{key: text, "confidence": score}, ...]`` profile lists."""
        return [{key: e.text.strip(), "confidence": e.confidence} for e in found]

    def extract_structured_profile(self, text: str) -> Dict:
        """
        Extract resume data into a structured candidate profile.

        Returns:
            {
                "name": str or None,
                "emails": List[str],
                "phones": List[str],
                "job_titles": List[{"title": str, "confidence": float}],
                "companies": List[{"name": str, "confidence": float}],
                "skills": List[{"name": str, "confidence": float}],
                "education": List[{"degree": str, "confidence": float}],
                "quality_score": float (0-100),
                "extraction_metadata": {...}
            }
        """
        entities = self.extract_all_entities(text)

        profile = {
            "name": None,
            "emails": [e.text.strip() for e in entities.get("EMAIL", [])],
            "phones": [e.text.strip() for e in entities.get("PHONE", [])],
            "job_titles": self._scored_items(entities.get("JOB", []), "title"),
            "companies": self._scored_items(entities.get("COMPANY", []), "name"),
            "skills": self._scored_items(entities.get("SKILL", []), "name"),
            "education": self._scored_items(entities.get("EDUCATION", []), "degree"),
            "extraction_metadata": {
                "total_entities_found": sum(len(v) for v in entities.values()),
                "entity_types_found": list(entities.keys()),
                "confidence_scores": {},
            },
        }

        # Several NAME candidates may be found; keep the most confident one.
        names = entities.get("NAME")
        if names:
            best = max(names, key=lambda e: e.confidence)
            profile["name"] = best.text.strip()
            profile["extraction_metadata"]["confidence_scores"]["name"] = best.confidence

        profile["quality_score"] = self._calculate_quality_score(profile)
        return profile

    def _calculate_quality_score(self, profile: Dict) -> float:
        """
        Calculate the extraction quality score (0-100).

        Each of name, emails, job titles, companies and skills contributes
        20 points when the profile holds a non-empty value for it.
        """
        max_score = 100
        weights = {
            "name": 20,
            "emails": 20,
            "job_titles": 20,
            "companies": 20,
            "skills": 20,
        }
        score = sum(weight for field, weight in weights.items() if profile.get(field))
        return float(min(score, max_score))

    def extract_with_fallback(self, text: str, fallback_extractor=None) -> Dict:
        """
        Extract using NER and flag when a fallback extractor would be needed.

        Args:
            text: Resume text
            fallback_extractor: Optional fallback (e.g., SkillExtractor)

        Returns:
            The NER-based structured profile.
        """
        ner_results = self.extract_structured_profile(text)

        # NOTE(review): the fallback extractor is only announced, never called;
        # its interface is not defined in this module, so merging its output is
        # still to be implemented.
        if fallback_extractor and ner_results.get("quality_score", 0) < 50:
            print("⚠️ NER quality low, attempting fallback extraction...")

        return ner_results
|
|
|
|
| |
| |
| |
|
|
if __name__ == "__main__":
    # Demo script: runs three extraction examples against the real model.
    import json

    def _banner(title):
        """Print a section title framed by two 60-character rules."""
        print("\n" + "=" * 60)
        print(title)
        print("=" * 60)

    _banner("EXAMPLE 1: Basic Entity Extraction")

    extractor = ResumeNERExtractor()

    sample_text = """
    John Smith
    john.smith@gmail.com
    Tel: +33612345678

    Professional Experience:
    Senior Python Developer at Google (2020-2023)
    Full Stack Engineer at Amazon (2018-2020)

    Skills: Python, FastAPI, React, Docker, Kubernetes, AWS

    Education:
    Bachelor of Science in Computer Science
    University of California, Berkeley
    """

    entities = extractor.extract_all_entities(sample_text)

    print("\nExtracted Entities:")
    for entity_type, entity_list in entities.items():
        print(f"\n{entity_type}:")
        for entity in entity_list:
            print(f"  - {entity.text:40} (confidence: {entity.confidence:.2f})")

    _banner("EXAMPLE 2: Structured Profile Extraction")

    profile = extractor.extract_structured_profile(sample_text)

    # default=float converts numpy scalar confidences, which json cannot
    # serialize on its own; ensure_ascii=False keeps accented names readable.
    print(json.dumps(profile, indent=2, ensure_ascii=False, default=float))

    _banner("EXAMPLE 3: Extract from File")

    try:
        with open("backend/test_cv.txt", "r", encoding="utf-8") as f:
            cv_text = f.read()
    except FileNotFoundError:
        print("⚠️ test_cv.txt not found")
    else:
        profile = extractor.extract_structured_profile(cv_text)
        print(f"\nQuality Score: {profile['quality_score']:.1f}/100")
        print(f"Name: {profile['name']}")
        print(f"Emails: {profile['emails']}")
        print(f"Skills found: {len(profile['skills'])}")
        print(f"Job titles: {len(profile['job_titles'])}")
|
|