Spaces:
Sleeping
Sleeping
# Resume NER Extractor - ready-to-use implementation
# File: backend/ai_module/nlp/resume_ner_extractor.py
"""
Resume parsing using BERT-based Named Entity Recognition.

Model: AventIQ-AI/Resume-Parsing-NER-AI-Model
Useful for: full resume extraction (name, email, skills, experience, etc.)
"""
| import re | |
| from typing import List, Dict, Optional, Tuple | |
| from dataclasses import dataclass | |
| try: | |
| from transformers import pipeline | |
| TRANSFORMERS_AVAILABLE = True | |
| except ImportError: | |
| TRANSFORMERS_AVAILABLE = False | |
| print("⚠️ Warning: transformers not installed. Install with: pip install transformers torch") | |
@dataclass
class ExtractedEntity:
    """Container for a single entity extracted from resume text.

    The class must be a dataclass: the extractor builds instances with
    keyword arguments and later mutates ``text`` and ``confidence`` while
    merging continuation tokens, so it is intentionally not frozen.
    """

    # Surface text of the entity.
    text: str
    # Normalized category: NAME, EMAIL, PHONE, JOB, COMPANY, SKILL, EDUCATION.
    entity_type: str
    # Model confidence score for the entity.
    confidence: float
    # Character offsets of the entity; both stay 0 when the position is unknown.
    start_char: int = 0
    end_char: int = 0
class ResumeNERExtractor:
    """
    Extract structured data from resume text using a BERT-based NER model.

    Supported entity types:
    - NAME: Candidate's full name
    - EMAIL: Email addresses
    - PHONE: Phone numbers
    - JOB: Job titles (Senior Developer, Manager, etc.)
    - COMPANY: Company names
    - SKILL: Technical/soft skills
    - EDUCATION: Educational degrees and qualifications

    NOTE(review): ENTITY_MAPPING assumes the model emits BIO labels such as
    ``B-NAME`` / ``I-NAME`` -- confirm against the model's ``id2label`` config.
    """

    MODEL_NAME = "AventIQ-AI/Resume-Parsing-NER-AI-Model"
    MIN_CONFIDENCE = 0.75  # Results scoring below this value are discarded
    # Maximum number of *characters* sent to the model per call. BERT's limit
    # is 512 tokens; a 512-character window stays well below it.
    MAX_TEXT_LENGTH = 512

    # Mapping from model labels to normalized categories
    ENTITY_MAPPING = {
        "B-NAME": "NAME",
        "I-NAME": "NAME",
        "B-EMAIL": "EMAIL",
        "I-EMAIL": "EMAIL",
        "B-PHONE": "PHONE",
        "I-PHONE": "PHONE",
        "B-EDUCATION": "EDUCATION",
        "I-EDUCATION": "EDUCATION",
        "B-SKILL": "SKILL",
        "I-SKILL": "SKILL",
        "B-JOB": "JOB",
        "I-JOB": "JOB",
        "B-COMPANY": "COMPANY",
        "I-COMPANY": "COMPANY",
        "O": "OTHER",
    }

    def __init__(self):
        """Load the NER pipeline.

        Raises:
            ImportError: If the ``transformers`` library is not installed.
            Exception: Any error raised while loading the model is printed
                and re-raised unchanged.
        """
        if not TRANSFORMERS_AVAILABLE:
            raise ImportError(
                "Transformers library required. Install with: pip install transformers torch"
            )
        print("📍 Loading Resume NER model... (first time may take a minute)")
        try:
            self.ner_pipeline = pipeline(
                "ner",
                model=self.MODEL_NAME,
                # "simple" merges adjacent tokens of the same entity and
                # reports them under the "entity_group" key (no B-/I- prefix).
                aggregation_strategy="simple",
            )
            print("✅ Model loaded successfully!")
        except Exception as e:
            print(f"❌ Error loading model: {e}")
            raise

    def extract_all_entities(self, text: str) -> Dict[str, List["ExtractedEntity"]]:
        """
        Extract all entities from resume text.

        Text longer than ``MAX_TEXT_LENGTH`` characters is processed in
        consecutive chunks instead of being truncated, so entities beyond the
        first chunk are no longer lost. Character offsets on the returned
        entities refer to positions in ``text``.

        Args:
            text: Resume text (can be multi-paragraph)

        Returns:
            Dictionary with entity types as keys and lists of ExtractedEntity
            as values, e.g. ``{"NAME": [...], "EMAIL": [...], "SKILL": [...]}``.
            Returns ``{}`` for empty/blank input or when the pipeline fails.
        """
        if not text or not text.strip():
            return {}

        raw_batches = []
        try:
            # Only the model call is guarded; parsing errors should surface.
            for offset, chunk in self._split_into_chunks(text):
                if chunk.strip():
                    raw_batches.append((offset, chunk, self.ner_pipeline(chunk)))
        except Exception as e:
            print(f"❌ NER extraction failed: {e}")
            return {}

        grouped: Dict[str, List["ExtractedEntity"]] = {}
        for offset, chunk, ner_results in raw_batches:
            for entity in self._parse_ner_results(ner_results, chunk, offset):
                grouped.setdefault(entity.entity_type, []).append(entity)
        return grouped

    def _split_into_chunks(self, text: str) -> list:
        """Split ``text`` into ``(offset, chunk)`` pairs of bounded length.

        Each chunk holds at most ``MAX_TEXT_LENGTH`` characters. A chunk ends
        just after the last newline inside the window when there is one,
        otherwise after the last space/tab, so words are not cut in half
        unless a single word exceeds the window. Concatenating the chunks
        reproduces ``text`` exactly.
        """
        limit = max(1, self.MAX_TEXT_LENGTH)  # guard against a non-positive limit
        total = len(text)
        chunks = []
        start = 0
        while start < total:
            end = min(start + limit, total)
            if end < total:
                cut = text.rfind("\n", start, end)
                if cut <= start:
                    cut = max(text.rfind(" ", start, end), text.rfind("\t", start, end))
                if cut > start:
                    end = cut + 1
            chunks.append((start, text[start:end]))
            start = end
        return chunks

    def _normalize_label(self, label: str) -> str:
        """Map a raw model label to a normalized category.

        Accepts both prefixed labels (``B-SKILL``) and the bare group names
        (``SKILL``) produced by an aggregating pipeline. Unknown labels map
        to ``"OTHER"``.
        """
        if label in self.ENTITY_MAPPING:
            return self.ENTITY_MAPPING[label]
        if label in self.ENTITY_MAPPING.values():
            return label
        return "OTHER"

    def _parse_ner_results(
        self, ner_results: list, original_text: str, offset: int = 0
    ) -> List["ExtractedEntity"]:
        """
        Parse raw NER pipeline results into ExtractedEntity objects.

        Handles both output shapes of the transformers NER pipeline:
        aggregated results (``entity_group`` key, one dict per entity) and
        token-level results (``entity`` key with ``B-``/``I-`` prefixes),
        which are merged here.

        Args:
            ner_results: Output from transformers NER pipeline
            original_text: Text that was given to the pipeline; when a result
                carries valid ``start``/``end`` offsets the entity text is
                sliced from it, otherwise the pipeline's ``word`` is used
            offset: Value added to ``start``/``end`` when storing positions
                (position of ``original_text`` inside a larger document)

        Returns:
            List of ExtractedEntity objects
        """
        entities = []
        current = None
        current_has_span = False
        text_len = len(original_text)

        for result in ner_results:
            label = result.get("entity_group") or result.get("entity") or "O"
            # Pipeline scores are NumPy floats, which json.dumps cannot serialize.
            score = float(result["score"])

            # Skip if confidence too low
            if score < self.MIN_CONFIDENCE:
                continue

            entity_type = self._normalize_label(label)
            if entity_type == "OTHER":
                # A confident non-entity token closes the entity in progress.
                if current is not None:
                    entities.append(current)
                    current = None
                continue

            token = result.get("word", "")
            start, end = result.get("start"), result.get("end")
            has_span = (
                start is not None
                and end is not None
                and 0 <= start < end <= text_len
            )

            if label.startswith("I-"):
                if current is None:
                    # Continuation without a kept beginning: dropped.
                    continue
                if current.entity_type == entity_type:
                    if has_span and current_has_span:
                        current.end_char = offset + end
                        current.text = original_text[current.start_char - offset:end]
                    elif token.startswith("##"):
                        # WordPiece continuation: glue to the previous piece.
                        current.text += token.replace("##", "")
                    else:
                        current.text += f" {token}"
                    # An entity is only as reliable as its weakest token.
                    current.confidence = min(current.confidence, score)
                    continue
                # Entity type changed: fall through and start a new entity.

            if current is not None:
                entities.append(current)
            current = ExtractedEntity(
                text=original_text[start:end] if has_span else token,
                entity_type=entity_type,
                confidence=score,
                start_char=offset + start if has_span else 0,
                end_char=offset + end if has_span else 0,
            )
            current_has_span = has_span

        # Don't forget last entity
        if current is not None:
            entities.append(current)
        return entities

    @staticmethod
    def _to_records(items: list, key: str) -> List[Dict]:
        """Convert entities to ``{key: text, "confidence": score}`` dicts."""
        return [{key: e.text.strip(), "confidence": e.confidence} for e in items]

    def extract_structured_profile(self, text: str) -> Dict:
        """
        Extract resume data into a structured candidate profile.

        Returns:
            {
                "name": str or None,
                "emails": List[str],
                "phones": List[str],
                "job_titles": List[{"title": str, "confidence": float}],
                "companies": List[{"name": str, "confidence": float}],
                "skills": List[{"name": str, "confidence": float}],
                "education": List[{"degree": str, "confidence": float}],
                "extraction_metadata": {...},
                "quality_score": float (0-100)
            }
        """
        entities = self.extract_all_entities(text)

        profile = {
            "name": None,
            "emails": [e.text.strip() for e in entities.get("EMAIL", [])],
            "phones": [e.text.strip() for e in entities.get("PHONE", [])],
            "job_titles": self._to_records(entities.get("JOB", []), "title"),
            "companies": self._to_records(entities.get("COMPANY", []), "name"),
            "skills": self._to_records(entities.get("SKILL", []), "name"),
            "education": self._to_records(entities.get("EDUCATION", []), "degree"),
            "extraction_metadata": {
                "total_entities_found": sum(len(v) for v in entities.values()),
                "entity_types_found": list(entities.keys()),
                "confidence_scores": {},
            },
        }

        # Several NAME candidates may be found; keep the most confident one.
        names = entities.get("NAME")
        if names:
            best = max(names, key=lambda e: e.confidence)
            profile["name"] = best.text.strip()
            profile["extraction_metadata"]["confidence_scores"]["name"] = best.confidence

        profile["quality_score"] = self._calculate_quality_score(profile)
        return profile

    def _calculate_quality_score(self, profile: Dict) -> float:
        """
        Calculate extraction quality score (0-100).

        Each of name, emails, job titles, companies and skills contributes
        20 points when the corresponding profile field is non-empty.
        """
        max_score = 100
        weights = {
            "name": 20,
            "emails": 20,
            "job_titles": 20,
            "companies": 20,
            "skills": 20,
        }
        score = sum(weight for field, weight in weights.items() if profile.get(field))
        return float(min(score, max_score))

    def extract_with_fallback(self, text: str, fallback_extractor=None) -> Dict:
        """
        Extract using NER, with a hook for a fallback extractor.

        NOTE: the fallback is not invoked yet. When ``fallback_extractor`` is
        given and the quality score is below 50, only a warning is printed;
        the NER profile is returned unchanged in every case.

        Args:
            text: Resume text
            fallback_extractor: Optional fallback (e.g., SkillExtractor)

        Returns:
            The profile produced by ``extract_structured_profile``
        """
        ner_results = self.extract_structured_profile(text)
        if fallback_extractor and ner_results.get("quality_score", 0) < 50:
            print("⚠️ NER quality low, attempting fallback extraction...")
            # Could merge with other extractors here
        return ner_results
| # ============================================================================ | |
| # USAGE EXAMPLES | |
| # ============================================================================ | |
if __name__ == "__main__":
    import json

    # Separator line reused by every example banner.
    rule = "=" * 60

    # Example 1: run the extractor on an inline sample resume.
    print("\n" + rule)
    print("EXAMPLE 1: Basic Entity Extraction")
    print(rule)
    extractor = ResumeNERExtractor()
    sample_text = """
John Smith
john.smith@gmail.com
Tel: +33612345678
Professional Experience:
Senior Python Developer at Google (2020-2023)
Full Stack Engineer at Amazon (2018-2020)
Skills: Python, FastAPI, React, Docker, Kubernetes, AWS
Education:
Bachelor of Science in Computer Science
University of California, Berkeley
"""
    entities = extractor.extract_all_entities(sample_text)
    print("\nExtracted Entities:")
    for entity_type in entities:
        print(f"\n{entity_type}:")
        for entity in entities[entity_type]:
            print(f" - {entity.text:40} (confidence: {entity.confidence:.2f})")

    # Example 2: build the structured profile and dump it as JSON.
    print("\n" + rule)
    print("EXAMPLE 2: Structured Profile Extraction")
    print(rule)
    profile = extractor.extract_structured_profile(sample_text)
    print(json.dumps(profile, indent=2))

    # Example 3: same extraction, with the resume read from a text file.
    print("\n" + rule)
    print("EXAMPLE 3: Extract from File")
    print(rule)
    try:
        with open("backend/test_cv.txt", "r", encoding="utf-8") as f:
            cv_text = f.read()
    except FileNotFoundError:
        print("⚠️ test_cv.txt not found")
    else:
        profile = extractor.extract_structured_profile(cv_text)
        print(f"\nQuality Score: {profile['quality_score']:.1f}/100")
        print(f"Name: {profile['name']}")
        print(f"Emails: {profile['emails']}")
        print(f"Skills found: {len(profile['skills'])}")
        print(f"Job titles: {len(profile['job_titles'])}")