Spaces:

RHmaster
/

ai-talent-finder-backend

Sleeping

File size: 3,299 Bytes

9df97a2

"""
CV text cleaning and preprocessing
Étape 6 - NLP preprocessing
"""

import re
from typing import List


class CVCleaner:
    """Clean and preprocess CV text"""
    
    @staticmethod
    def clean_text(text: str) -> str:
        """
        Clean and normalize CV text
        
        Args:
            text: Raw CV text
        
        Returns:
            Cleaned text
        """
        if not text:
            return ""
        
        # 1. Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        
        # 2. Remove URLs
        text = re.sub(r'http\S+|www\S+', '', text)
        
        # 3. Remove email patterns
        text = re.sub(r'[\w\.-]+@[\w\.-]+\.\w+', '', text)
        
        # 4. Remove phone numbers
        text = re.sub(r'\b(?:\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})\b', '', text)
        
        # 5. Remove special characters but keep spaces
        text = re.sub(r'[^a-zA-Z0-9\s\-]', '', text)
        
        return text
    
    @staticmethod
    def extract_sections(text: str) -> dict:
        """
        Try to extract CV sections (experience, education, skills, etc.)
        
        Args:
            text: CV text
        
        Returns:
            Dict with identified sections
        """
        sections = {
            "experience": "",
            "education": "",
            "skills": "",
            "summary": ""
        }
        
        # Common section headers (case-insensitive)
        exp_patterns = [r'(?:professional\s+)?experience', r'work\s+history', r'employment history']
        edu_patterns = [r'education', r'academic background', r'qualifications']
        skill_patterns = [r'skills?', r'competencies', r'technical skills']
        summary_patterns = [r'summary', r'objective', r'professional summary', r'about me']
        
        text_lower = text.lower()
        
        # Split by section headers
        current_section = "summary"
        current_text = ""
        
        for line in text.split('\n'):
            line_lower = line.lower()
            
            # Check for section headers
            is_exp = any(re.search(pattern, line_lower) for pattern in exp_patterns)
            is_edu = any(re.search(pattern, line_lower) for pattern in edu_patterns)
            is_skill = any(re.search(pattern, line_lower) for pattern in skill_patterns)
            is_summary = any(re.search(pattern, line_lower) for pattern in summary_patterns)
            
            if is_exp:
                sections[current_section] = current_text
                current_section = "experience"
                current_text = ""
            elif is_edu:
                sections[current_section] = current_text
                current_section = "education"
                current_text = ""
            elif is_skill:
                sections[current_section] = current_text
                current_section = "skills"
                current_text = ""
            elif is_summary:
                sections[current_section] = current_text
                current_section = "summary"
                current_text = ""
            else:
                current_text += " " + line
        
        sections[current_section] = current_text
        
        return sections