from pathlib import Path import json import re from typing import List, Dict, Any _BASE_DIR = Path(__file__).resolve().parents[2] _DATA_DIR = _BASE_DIR / "ai_module" / "data" _SKILLS_FILE = _DATA_DIR / "skills_dictionary.json" _MAPPINGS_FILE = _DATA_DIR / "skill_mappings.json" def _load_json(path: Path) -> Dict[str, Any]: if not path.exists(): return {} try: return json.loads(path.read_text(encoding="utf-8")) except Exception: return {} _SKILLS_DICT = _load_json(_SKILLS_FILE) _MAPPINGS = {k.lower(): v for k, v in _load_json(_MAPPINGS_FILE).items()} def normalize_skill(skill: str) -> str: """Normalize a single skill string to a canonical form. Steps: - lower + strip - basic punctuation cleanup - map via `skill_mappings.json` if available - if present in `skills_dictionary.json`, return the canonical spelling - otherwise return title-cased fallback """ if not skill: return "" s = skill.strip() s = s.replace("\n", " ") s = re.sub(r"[\(\)\[\]\\/\\]", " ", s) s = re.sub(r"[\.,;:]$", "", s) s_clean = re.sub(r"\s+", " ", s).strip() key = s_clean.lower() # direct mapping if key in _MAPPINGS: return _MAPPINGS[key] # search in skills dictionary (case-insensitive) for cat, items in _SKILLS_DICT.items(): for item in items: if item.lower() == key: return item # fuzzy-ish normalization: common replacements replacements = { "\bjs\b": "JavaScript", "\bnodejs\b": "Node.js", "\breactjs\b": "React", "\bdevops\b": "DevOps", } for pattern, repl in replacements.items(): if re.search(pattern, key): return repl # fallback: title case acronyms preserved if key.isupper() or len(key) <= 3: return s_clean.upper() return s_clean.title() def normalize_skills_list(skills: List[str]) -> List[str]: seen = set() out = [] for sk in skills: norm = normalize_skill(sk) if not norm: continue if norm not in seen: seen.add(norm) out.append(norm) return out _YEARS_RE = re.compile(r"(\d{1,2})(?:\+)?\s*(?:years|ans|yrs|year)", re.IGNORECASE) def parse_experience_years(text: str) -> int: if not text: return 0 m = _YEARS_RE.search(text) if m: try: return int(m.group(1)) except Exception: return 0 return 0 def clean_candidate(candidate: Dict[str, Any]) -> Dict[str, Any]: """Return a cleaned/normalized copy of a candidate dict. Expected keys handled: `skills` (list or comma string), `experience`, `summary`. Adds `normalized_skills` and `experience_years` when possible. """ out = dict(candidate) skills = out.get("skills") or out.get("competences") or [] if isinstance(skills, str): # split on commas/semicolons/slash skills_list = re.split(r"[,;/\\]\s*", skills) elif isinstance(skills, (list, tuple)): skills_list = list(skills) else: skills_list = [] out["normalized_skills"] = normalize_skills_list([s for s in skills_list if s]) # experience years years = out.get("experience_years") or 0 if not years: years = parse_experience_years(out.get("experience") or out.get("summary") or "") try: out["experience_years"] = int(years) except Exception: out["experience_years"] = years # dedupe and basic cleanup for education edu = out.get("education") if isinstance(edu, str): out["education"] = edu.strip() return out __all__ = [ "normalize_skill", "normalize_skills_list", "parse_experience_years", "clean_candidate", ] if __name__ == "__main__": # quick smoke test sample = { "skills": "react.js, JS, python, ml, AWS, docker-compose", "experience": "5+ years in web development", "education": "Bachelor in Computer Science", } print(clean_candidate(sample))