Spaces:

RHmaster
/

ai-talent-finder-backend

Sleeping

ai-talent-finder-backend / ai_module /nlp /cv_cleaner.py

ilyass yani

Deploiement backend dans HF Spaces

9df97a2 11 days ago

3.3 kB

	"""
	CV text cleaning and preprocessing
	Step 6 - NLP preprocessing
	"""

	import re
	import unicodedata
	from typing import List


	class CVCleaner:
	    """Clean and preprocess raw CV text.

	    Both helpers are static and keep no state; the compiled patterns below
	    are built once and shared by every call.
	    """

	    # Only an explicit scheme or a "www." prefix counts as a URL, so words
	    # that merely start with "http" or "www" (e.g. "httpd") are kept.
	    _URL_RE = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)
	    _EMAIL_RE = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')
	    # Optional "+1"/"1" prefix followed by 3-3-4 digit groups, each pair of
	    # groups separated by at most one of "-", "." or a whitespace character.
	    _PHONE_RE = re.compile(
	        r'\b(?:\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})\b'
	    )
	    # Anything that is not an ASCII letter, a digit, whitespace or a hyphen.
	    _SPECIAL_RE = re.compile(r'[^a-zA-Z0-9\s\-]')
	    _WHITESPACE_RE = re.compile(r'\s+')

	    # (section name, header pattern) pairs. They are tried in this order, so
	    # the first matching section wins when a line matches several of them.
	    _SECTION_HEADERS = (
	        ("experience", re.compile(
	            r'(?:professional\s+)?experience|work\s+history|employment history'
	        )),
	        ("education", re.compile(
	            r'education|academic background|qualifications'
	        )),
	        ("skills", re.compile(
	            r'skills?|competencies|technical skills'
	        )),
	        ("summary", re.compile(
	            r'summary|objective|professional summary|about me'
	        )),
	    )

	    @staticmethod
	    def clean_text(text: str) -> str:
	        """
	        Clean and normalize CV text.

	        URLs, e-mail addresses and phone numbers are removed, accented
	        letters are reduced to their base letter, every character other
	        than ASCII letters, digits, whitespace and hyphens is dropped, and
	        runs of whitespace are collapsed to a single space.

	        Note that symbols such as "+", "#" and "." are dropped as well, so
	        "C++" comes out as "C".

	        Args:
	            text: Raw CV text. A falsy value (``None`` or ``""``) is accepted.

	        Returns:
	            Cleaned text, without leading or trailing whitespace. An empty
	            string when ``text`` is falsy.
	        """
	        if not text:
	            return ""

	        # Collapse whitespace up front: the phone pattern tolerates only a
	        # single separator character between digit groups.
	        text = CVCleaner._WHITESPACE_RE.sub(' ', text).strip()

	        text = CVCleaner._URL_RE.sub('', text)
	        text = CVCleaner._EMAIL_RE.sub('', text)
	        text = CVCleaner._PHONE_RE.sub('', text)

	        # Split accented letters into base letter + combining mark so the
	        # ASCII filter below drops only the mark, not the whole letter.
	        text = unicodedata.normalize('NFKD', text)
	        text = CVCleaner._SPECIAL_RE.sub('', text)

	        # The removals above leave gaps behind; normalize spacing once more.
	        return CVCleaner._WHITESPACE_RE.sub(' ', text).strip()

	    @staticmethod
	    def extract_sections(text: str) -> dict:
	        """
	        Try to extract CV sections (experience, education, skills, summary).

	        The text is scanned line by line. A line whose lower-cased form
	        contains one of the known header keywords switches the current
	        section; the header line itself is not stored. Every other line is
	        appended to the current section. Lines before the first header go
	        to "summary". When a section is entered more than once its content
	        accumulates instead of being overwritten.

	        NOTE: header detection is a substring search, so a content line
	        that merely mentions a keyword (e.g. "experience with Docker") is
	        treated as a header.

	        Args:
	            text: CV text with sections on separate lines ('\\n'-separated).
	                A falsy value (``None`` or ``""``) is accepted.

	        Returns:
	            Dict with the keys "experience", "education", "skills" and
	            "summary". Each value is the concatenation of the section's
	            lines, every line prefixed with a single space; sections that
	            were not found map to an empty string.
	        """
	        sections = {
	            "experience": "",
	            "education": "",
	            "skills": "",
	            "summary": ""
	        }
	        if not text:
	            return sections

	        current_section = "summary"
	        pending = []  # lines collected for current_section since the last header

	        for line in text.split('\n'):
	            line_lower = line.lower()
	            header = next(
	                (name for name, pattern in CVCleaner._SECTION_HEADERS
	                 if pattern.search(line_lower)),
	                None,
	            )

	            if header is None:
	                pending.append(line)
	                continue

	            # Append (do not assign) so a section seen earlier keeps its text.
	            sections[current_section] += "".join(" " + part for part in pending)
	            current_section = header
	            pending = []

	        sections[current_section] += "".join(" " + part for part in pending)

	        return sections