Spaces:
Sleeping
Sleeping
| """ | |
| CV text cleaning and preprocessing | |
| Step 6 - NLP preprocessing | |
| """ | |
| import re | |
| from typing import List | |
class CVCleaner:
    """Clean and preprocess CV text.

    Both helpers are stateless and exposed as static methods, so they can be
    called on the class (``CVCleaner.clean_text(raw)``) or on an instance.
    """

    # Patterns are compiled once at class-creation time instead of per call.
    _WHITESPACE_RE = re.compile(r'\s+')
    _URL_RE = re.compile(r'http\S+|www\S+')
    _EMAIL_RE = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')
    _PHONE_RE = re.compile(
        r'\b(?:\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})\b'
    )
    # Anything that is not a letter/digit (in any script), whitespace or a
    # hyphen. ``\w`` also admits the underscore, so it is removed explicitly.
    _SPECIAL_CHARS_RE = re.compile(r'[^\w\s\-]|_')

    # Section name -> header patterns, listed in matching priority order.
    # Patterns are searched against the lower-cased line.
    _SECTION_HEADERS = (
        ("experience", (
            re.compile(r'(?:professional\s+)?experience'),
            re.compile(r'work\s+history'),
            re.compile(r'employment history'),
        )),
        ("education", (
            re.compile(r'education'),
            re.compile(r'academic background'),
            re.compile(r'qualifications'),
        )),
        ("skills", (
            re.compile(r'skills?'),
            re.compile(r'competencies'),
            re.compile(r'technical skills'),
        )),
        ("summary", (
            re.compile(r'summary'),
            re.compile(r'objective'),
            re.compile(r'professional summary'),
            re.compile(r'about me'),
        )),
    )

    @staticmethod
    def clean_text(text: str) -> str:
        """
        Clean and normalize CV text.

        URLs, e-mail addresses, phone numbers (3-3-4 digit groups with an
        optional leading ``1``/``+1``) and punctuation are removed; letters,
        digits, whitespace and hyphens are kept.

        Args:
            text: Raw CV text

        Returns:
            Cleaned text on a single line, with runs of whitespace collapsed
            to one space and no leading/trailing whitespace. Empty or
            ``None`` input yields ``""``.
        """
        if not text:
            return ""

        # 1. Collapse whitespace first so the phone pattern, which allows a
        #    single separator character between digit groups, can match.
        text = CVCleaner._WHITESPACE_RE.sub(' ', text).strip()

        # 2. Remove URLs
        text = CVCleaner._URL_RE.sub('', text)

        # 3. Remove email patterns
        text = CVCleaner._EMAIL_RE.sub('', text)

        # 4. Remove phone numbers
        text = CVCleaner._PHONE_RE.sub('', text)

        # 5. Remove special characters but keep letters (including accented
        #    ones), digits, spaces and hyphens
        text = CVCleaner._SPECIAL_CHARS_RE.sub('', text)

        # 6. The removals above leave gaps behind; collapse them again so the
        #    result never contains doubled or trailing spaces.
        return CVCleaner._WHITESPACE_RE.sub(' ', text).strip()

    @staticmethod
    def extract_sections(text: str) -> dict:
        """
        Try to extract CV sections (experience, education, skills, summary).

        The text is scanned line by line. A line containing one of the known
        header keywords switches the current section and is itself not
        stored; every other line is appended to the current section. Lines
        seen before any header go to ``"summary"``. Content is accumulated,
        so a section that is entered several times keeps all of its lines.

        NOTE: header detection is a substring search, so a content line that
        merely contains a keyword (e.g. "5 years experience") is also treated
        as a header.

        Args:
            text: CV text with its original line breaks

        Returns:
            Dict with the keys ``"experience"``, ``"education"``,
            ``"skills"`` and ``"summary"``. Each value is the concatenation
            of that section's lines, every line prefixed with one space, or
            ``""`` when the section has no lines. Empty or ``None`` input
            yields all-empty sections.
        """
        sections = {
            "experience": "",
            "education": "",
            "skills": "",
            "summary": ""
        }
        if not text:
            return sections

        collected = {name: [] for name in sections}
        current_section = "summary"

        for line in text.split('\n'):
            line_lower = line.lower()

            # First section (in priority order) whose patterns hit this line.
            header = None
            for name, patterns in CVCleaner._SECTION_HEADERS:
                if any(pattern.search(line_lower) for pattern in patterns):
                    header = name
                    break

            if header is not None:
                current_section = header
            else:
                collected[current_section].append(line)

        for name, lines in collected.items():
            # Keep the historical " <line>" layout of each stored line.
            sections[name] = "".join(" " + line for line in lines)

        return sections