"""Text and skill normalization helpers used across extraction, matching and training.""" from __future__ import annotations import re _WHITESPACE_RE = re.compile(r"\s+") _PUNCT_RE = re.compile(r"[\t\r\n]+") _SKILL_ALIASES = { "ml": "Machine Learning", "machine learning": "Machine Learning", "ai": "Artificial Intelligence", "artificial intelligence": "Artificial Intelligence", "nlp": "Natural Language Processing", "natural language processing": "Natural Language Processing", "js": "JavaScript", "javascript": "JavaScript", "ts": "TypeScript", "typescript": "TypeScript", "sql": "SQL", "python": "Python", "fastapi": "FastAPI", "docker": "Docker", "aws": "AWS", "devops": "DevOps", "react": "React", "node js": "Node.js", "nodejs": "Node.js", "pandas": "Pandas", "scikit learn": "Scikit-Learn", } def normalize_text(value: str | None) -> str: if not value: return "" text = str(value).replace("\u00a0", " ") text = _PUNCT_RE.sub(" ", text) text = _WHITESPACE_RE.sub(" ", text).strip() return text def normalize_skill_name(value: str | None) -> str: text = normalize_text(value) if not text: return "" text = text.replace("/", " ") text = _WHITESPACE_RE.sub(" ", text) normalized = text.strip().lower() return _SKILL_ALIASES.get(normalized, text.title()) def normalize_company_name(value: str | None) -> str: text = normalize_text(value) return text.strip(" ,.-") def normalize_job_title(value: str | None) -> str: text = normalize_text(value) return text[:120] def compact_join(values: list[str] | tuple[str, ...] | None) -> str: if not values: return "" return normalize_text(" ".join(normalize_text(item) for item in values if normalize_text(item)))