Spaces:

RHmaster
/

ai-talent-finder-backend

Sleeping

File size: 13,411 Bytes

9df97a2

# Resume NER Extractor - Implémentation Prête à l'Emploi
# Fichier: backend/ai_module/nlp/resume_ner_extractor.py

"""
Resume Parsing using BERT-based Named Entity Recognition
Modèle: AventIQ-AI/Resume-Parsing-NER-AI-Model
Utile pour: Extraction complète de CV (nom, email, compétences, expérience, etc.)
"""

import re
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass

try:
    from transformers import pipeline
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    print("⚠️ Warning: transformers not installed. Install with: pip install transformers torch")


@dataclass
class ExtractedEntity:
    """Container for a single extracted entity"""
    text: str
    entity_type: str  # NAME, EMAIL, PHONE, JOB, COMPANY, SKILL, EDUCATION
    confidence: float
    start_char: int = 0
    end_char: int = 0


class ResumeNERExtractor:
    """
    Extract structured data from resume using BERT-based NER model
    
    Supported entity types:
    - NAME: Candidate's full name
    - EMAIL: Email addresses
    - PHONE: Phone numbers
    - JOB: Job titles (Senior Developer, Manager, etc.)
    - COMPANY: Company names
    - SKILL: Technical/soft skills
    - EDUCATION: Educational degrees and qualifications
    """
    
    MODEL_NAME = "AventIQ-AI/Resume-Parsing-NER-AI-Model"
    MIN_CONFIDENCE = 0.75  # Only extract if confidence > 75%
    MAX_TEXT_LENGTH = 512  # BERT max input
    
    # Mapping from model labels to normalized categories
    ENTITY_MAPPING = {
        "B-NAME": "NAME",
        "I-NAME": "NAME",
        "B-EMAIL": "EMAIL",
        "I-EMAIL": "EMAIL",
        "B-PHONE": "PHONE",
        "I-PHONE": "PHONE",
        "B-EDUCATION": "EDUCATION",
        "I-EDUCATION": "EDUCATION",
        "B-SKILL": "SKILL",
        "I-SKILL": "SKILL",
        "B-JOB": "JOB",
        "I-JOB": "JOB",
        "B-COMPANY": "COMPANY",
        "I-COMPANY": "COMPANY",
        "O": "OTHER"
    }
    
    def __init__(self):
        """Initialize the NER pipeline"""
        if not TRANSFORMERS_AVAILABLE:
            raise ImportError(
                "Transformers library required. Install with: pip install transformers torch"
            )
        
        print("📍 Loading Resume NER model... (first time may take a minute)")
        try:
            self.ner_pipeline = pipeline(
                "ner",
                model=self.MODEL_NAME,
                aggregation_strategy="simple"  # Keep subwords separate
            )
            print("✅ Model loaded successfully!")
        except Exception as e:
            print(f"❌ Error loading model: {e}")
            raise
    
    def extract_all_entities(self, text: str) -> Dict[str, List[ExtractedEntity]]:
        """
        Extract all entities from resume text
        
        Args:
            text: Resume text (can be multi-paragraph)
        
        Returns:
            Dictionary with entity types as keys and list of ExtractedEntity as values
            {
                "NAME": [ExtractedEntity(...), ...],
                "EMAIL": [ExtractedEntity(...), ...],
                "SKILL": [...],
                ...
            }
        """
        if not text or len(text.strip()) == 0:
            return {}
        
        # Truncate to max length
        text = text[:self.MAX_TEXT_LENGTH]
        
        try:
            # Run NER pipeline
            ner_results = self.ner_pipeline(text)
        except Exception as e:
            print(f"❌ NER extraction failed: {e}")
            return {}
        
        # Parse results into structured format
        entities = self._parse_ner_results(ner_results, text)
        
        # Group by entity type
        grouped = {}
        for entity in entities:
            if entity.entity_type not in grouped:
                grouped[entity.entity_type] = []
            grouped[entity.entity_type].append(entity)
        
        return grouped
    
    def _parse_ner_results(self, ner_results: list, original_text: str) -> List[ExtractedEntity]:
        """
        Parse raw NER pipeline results into ExtractedEntity objects
        
        Args:
            ner_results: Output from transformers NER pipeline
            original_text: Original text (for position tracking)
        
        Returns:
            List of ExtractedEntity objects
        """
        entities = []
        current_entity = None
        
        for result in ner_results:
            token = result["word"]
            label = result["entity"]
            score = result["score"]
            
            # Normalize label
            entity_type = self.ENTITY_MAPPING.get(label, "OTHER")
            
            # Skip if confidence too low
            if score < self.MIN_CONFIDENCE:
                continue
            
            # Skip "other" entities
            if entity_type == "OTHER":
                if current_entity:
                    entities.append(current_entity)
                    current_entity = None
                continue
            
            # Handle B- (Beginning) tags
            if label.startswith("B-"):
                # Save previous entity if exists
                if current_entity:
                    entities.append(current_entity)
                
                # Start new entity
                current_entity = ExtractedEntity(
                    text=token,
                    entity_type=entity_type,
                    confidence=score
                )
            
            # Handle I- (Inside/continuation) tags
            elif label.startswith("I-") and current_entity:
                # Continue current entity
                if current_entity.entity_type == entity_type:
                    # Merge with space if needed
                    current_entity.text += f" {token}" if not token.startswith("##") else token.replace("##", "")
                    # Use minimum confidence
                    current_entity.confidence = min(current_entity.confidence, score)
                else:
                    # Entity type changed, save previous
                    entities.append(current_entity)
                    current_entity = ExtractedEntity(
                        text=token,
                        entity_type=entity_type,
                        confidence=score
                    )
        
        # Don't forget last entity
        if current_entity:
            entities.append(current_entity)
        
        return entities
    
    def extract_structured_profile(self, text: str) -> Dict:
        """
        Extract resume data into structured candidate profile
        
        Returns:
            {
                "name": str or None,
                "emails": List[str],
                "phones": List[str],
                "job_titles": List[{"title": str, "confidence": float}],
                "companies": List[{"name": str, "confidence": float}],
                "skills": List[{"name": str, "confidence": float}],
                "education": List[{"degree": str, "confidence": float}],
                "quality_score": float (0-100),
                "extraction_metadata": {...}
            }
        """
        # Extract all entities
        entities = self.extract_all_entities(text)
        
        # Build structured profile
        profile = {
            "name": None,
            "emails": [],
            "phones": [],
            "job_titles": [],
            "companies": [],
            "skills": [],
            "education": [],
            "extraction_metadata": {
                "total_entities_found": sum(len(v) for v in entities.values()),
                "entity_types_found": list(entities.keys()),
                "confidence_scores": {}
            }
        }
        
        # Process NAME
        if "NAME" in entities and entities["NAME"]:
            name_entity = max(entities["NAME"], key=lambda e: e.confidence)
            profile["name"] = name_entity.text.strip()
            profile["extraction_metadata"]["confidence_scores"]["name"] = name_entity.confidence
        
        # Process EMAIL
        if "EMAIL" in entities:
            profile["emails"] = [
                e.text.strip() for e in entities["EMAIL"]
            ]
        
        # Process PHONE
        if "PHONE" in entities:
            profile["phones"] = [
                e.text.strip() for e in entities["PHONE"]
            ]
        
        # Process JOB TITLES
        if "JOB" in entities:
            profile["job_titles"] = [
                {
                    "title": e.text.strip(),
                    "confidence": e.confidence
                }
                for e in entities["JOB"]
            ]
        
        # Process COMPANIES
        if "COMPANY" in entities:
            profile["companies"] = [
                {
                    "name": e.text.strip(),
                    "confidence": e.confidence
                }
                for e in entities["COMPANY"]
            ]
        
        # Process SKILLS
        if "SKILL" in entities:
            profile["skills"] = [
                {
                    "name": e.text.strip(),
                    "confidence": e.confidence
                }
                for e in entities["SKILL"]
            ]
        
        # Process EDUCATION
        if "EDUCATION" in entities:
            profile["education"] = [
                {
                    "degree": e.text.strip(),
                    "confidence": e.confidence
                }
                for e in entities["EDUCATION"]
            ]
        
        # Calculate quality score
        profile["quality_score"] = self._calculate_quality_score(profile)
        
        return profile
    
    def _calculate_quality_score(self, profile: Dict) -> float:
        """
        Calculate extraction quality score (0-100)
        Based on: name, email, job titles, companies, skills
        """
        score = 0
        max_score = 100
        
        weights = {
            "name": 20,
            "emails": 20,
            "job_titles": 20,
            "companies": 20,
            "skills": 20,
        }
        
        if profile.get("name"):
            score += weights["name"]
        if profile.get("emails"):
            score += weights["emails"]
        if profile.get("job_titles"):
            score += weights["job_titles"]
        if profile.get("companies"):
            score += weights["companies"]
        if profile.get("skills"):
            score += weights["skills"]
        
        return min(score, max_score)
    
    def extract_with_fallback(self, text: str, fallback_extractor=None) -> Dict:
        """
        Extract using NER, fallback to other extractor if needed
        
        Args:
            text: Resume text
            fallback_extractor: Optional fallback (e.g., SkillExtractor)
        
        Returns:
            Combined extraction results
        """
        # Try NER extraction first
        ner_results = self.extract_structured_profile(text)
        
        # If quality is low, try fallback
        if fallback_extractor and ner_results.get("quality_score", 0) < 50:
            print("⚠️ NER quality low, attempting fallback extraction...")
            # Could merge with other extractors here
        
        return ner_results


# ============================================================================
# USAGE EXAMPLES
# ============================================================================

if __name__ == "__main__":
    # Example 1: Basic usage
    print("\n" + "="*60)
    print("EXAMPLE 1: Basic Entity Extraction")
    print("="*60)
    
    extractor = ResumeNERExtractor()
    
    sample_text = """
    John Smith
    john.smith@gmail.com
    Tel: +33612345678
    
    Professional Experience:
    Senior Python Developer at Google (2020-2023)
    Full Stack Engineer at Amazon (2018-2020)
    
    Skills: Python, FastAPI, React, Docker, Kubernetes, AWS
    
    Education:
    Bachelor of Science in Computer Science
    University of California, Berkeley
    """
    
    # Extract all entities
    entities = extractor.extract_all_entities(sample_text)
    
    print("\nExtracted Entities:")
    for entity_type, entity_list in entities.items():
        print(f"\n{entity_type}:")
        for entity in entity_list:
            print(f"  - {entity.text:40} (confidence: {entity.confidence:.2f})")
    
    # Example 2: Structured profile
    print("\n" + "="*60)
    print("EXAMPLE 2: Structured Profile Extraction")
    print("="*60)
    
    profile = extractor.extract_structured_profile(sample_text)
    
    import json
    print(json.dumps(profile, indent=2))
    
    # Example 3: Read from file
    print("\n" + "="*60)
    print("EXAMPLE 3: Extract from File")
    print("="*60)
    
    try:
        with open("backend/test_cv.txt", "r", encoding="utf-8") as f:
            cv_text = f.read()
        
        profile = extractor.extract_structured_profile(cv_text)
        print(f"\nQuality Score: {profile['quality_score']:.1f}/100")
        print(f"Name: {profile['name']}")
        print(f"Emails: {profile['emails']}")
        print(f"Skills found: {len(profile['skills'])}")
        print(f"Job titles: {len(profile['job_titles'])}")
    except FileNotFoundError:
        print("⚠️ test_cv.txt not found")