# ai-talent-finder-backend/app/services/data_normalization.py
# ilyass yani
# Deploiement backend dans HF Spaces (backend deployment to HF Spaces)
# 9df97a2
# Raw
# History Blame Contribute Delete
# 4.07 kB
from pathlib import Path
import json
import re
from typing import List, Dict, Any
# Locations of the JSON resources used for skill normalization.
# `parents[2]` is the directory two levels above the one containing this file;
# the data files are looked up under `<that directory>/ai_module/data`.
_BASE_DIR = Path(__file__).resolve().parents[2]
_DATA_DIR = _BASE_DIR / "ai_module" / "data"
# Read by `normalize_skill` as {category: [canonical skill spellings, ...]}.
_SKILLS_FILE = _DATA_DIR / "skills_dictionary.json"
# Read by `normalize_skill` as {alias: canonical skill}.
_MAPPINGS_FILE = _DATA_DIR / "skill_mappings.json"
def _load_json(path: Path) -> Dict[str, Any]:
    """Load a JSON object from *path*, returning ``{}`` when that is not possible.

    This is a deliberately best-effort loader: the module must stay importable
    even when the optional data files are absent or damaged.

    Args:
        path: Location of a UTF-8 encoded JSON file.

    Returns:
        The decoded JSON object, or an empty dict when the file is missing or
        unreadable, is not valid UTF-8/JSON, or its top-level value is not a
        JSON object.
    """
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError covers a missing/unreadable file; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return {}
    # Callers iterate the result with `.items()`, so anything that is not a
    # JSON object (a list, a string, a number, null) is treated as "no data".
    return data if isinstance(data, dict) else {}
# Lookup tables loaded once at import time; both are empty dicts when
# `_load_json` cannot read the corresponding file.
_SKILLS_DICT = _load_json(_SKILLS_FILE)
# Mapping keys are lower-cased so that lookups with a lower-cased skill are
# case-insensitive.
_MAPPINGS = {k.lower(): v for k, v in _load_json(_MAPPINGS_FILE).items()}
# Well-known aliases recognised anywhere inside a skill string, tried in order.
# The lookarounds behave like word boundaries that also treat "." as part of a
# token, so the bare alias "js" does not fire inside "vue.js" or "next.js".
_ALIAS_PATTERNS = (
    (re.compile(r"(?<![\w.])js(?![\w.])"), "JavaScript"),
    (re.compile(r"(?<![\w.])node\.?js(?![\w.])"), "Node.js"),
    (re.compile(r"(?<![\w.])react\.?js(?![\w.])"), "React"),
    (re.compile(r"(?<![\w.])devops(?![\w.])"), "DevOps"),
)


def normalize_skill(skill: str) -> str:
    """Normalize a single skill string to a canonical form.

    Steps, in order:
      1. Replace brackets, parentheses, slashes and backslashes with spaces,
         collapse whitespace, and drop one trailing ``.``, ``,``, ``;`` or ``:``.
      2. Return the entry from ``_MAPPINGS`` when the lower-cased text is a key.
      3. Return the spelling stored in ``_SKILLS_DICT`` when an entry matches
         case-insensitively.
      4. Return the canonical name of a well-known alias (``js``, ``nodejs``,
         ``reactjs``, ``devops``) found in the text.
      5. Fall back to upper case for acronyms (input already upper case, or at
         most three characters long) and to title case otherwise.

    Args:
        skill: Raw skill text.

    Returns:
        The canonical skill name, or ``""`` when *skill* is empty, is not a
        string, or contains nothing but punctuation and whitespace.
    """
    if not skill or not isinstance(skill, str):
        return ""

    text = re.sub(r"[()\[\]\\/]", " ", skill)
    text = re.sub(r"\s+", " ", text).strip()
    # Strip the trailing punctuation only after whitespace has been trimmed,
    # so inputs such as "python.)" lose their dot as well.
    s_clean = re.sub(r"[.,;:]$", "", text).strip()
    if not s_clean:
        return ""
    key = s_clean.lower()

    # Direct alias -> canonical mapping.
    if key in _MAPPINGS:
        return _MAPPINGS[key]

    # Case-insensitive search through every category of the skills dictionary.
    for items in _SKILLS_DICT.values():
        for item in items:
            if item.lower() == key:
                return item

    # Fuzzy-ish normalization: well-known aliases embedded in the text.
    for pattern, canonical in _ALIAS_PATTERNS:
        if pattern.search(key):
            return canonical

    # Fallback. The acronym check must look at the cleaned input rather than
    # the lower-cased key, otherwise it can never be true.
    if s_clean.isupper() or len(s_clean) <= 3:
        return s_clean.upper()
    return s_clean.title()
def normalize_skills_list(skills: List[str]) -> List[str]:
    """Normalize each skill and return the distinct, non-empty results.

    Every entry goes through ``normalize_skill``. Entries that normalize to an
    empty value are dropped, and only the first occurrence of each normalized
    name is kept, in input order.
    """
    canonical_names = (normalize_skill(raw) for raw in skills)
    # Dict keys keep insertion order, which gives an order-preserving dedupe.
    return list(dict.fromkeys(name for name in canonical_names if name))
# A one- or two-digit count followed by a unit: "5 years", "5+ yrs", "10 ans".
#   * The lookbehinds stop the match from starting in the middle of a longer
#     number or in the fractional part of a decimal ("2015 years", "3.5 years").
#   * An optional one- or two-digit fraction is skipped, keeping whole years.
#   * The trailing \b keeps a unit from matching a word prefix ("15 answers").
_YEARS_RE = re.compile(
    r"(?<!\d)(?<!\d[.,])(\d{1,2})(?:[.,]\d{1,2})?\s*\+?\s*(?:years?|yrs|ans)\b",
    re.IGNORECASE,
)


def parse_experience_years(text: str) -> int:
    """Extract a number of years of experience from free text.

    Args:
        text: Free text such as ``"5+ years in web development"``.

    Returns:
        The whole number of years in the first "<number> <unit>" mention, where
        the unit is one of ``year``, ``years``, ``yrs`` or ``ans`` (matched
        case-insensitively). Returns 0 when *text* is empty, is not a string,
        or contains no such mention.
    """
    if not text or not isinstance(text, str):
        return 0
    match = _YEARS_RE.search(text)
    # Group 1 is one or two ASCII/Unicode digits, so int() cannot fail here.
    return int(match.group(1)) if match else 0
def clean_candidate(candidate: Dict[str, Any]) -> Dict[str, Any]:
    """Return a cleaned/normalized shallow copy of a candidate dict.

    The input dict is not modified. Keys handled:

    * ``skills`` (falling back to ``competences``): a list/tuple of strings, or
      a single string split on commas, semicolons, slashes and backslashes.
      The normalized, de-duplicated result is stored in ``normalized_skills``;
      entries that are empty or not strings are ignored.
    * ``experience_years``: kept when already set to a truthy value; otherwise
      parsed from the first of ``experience`` / ``summary`` that is a string
      mentioning a year count. Stored as an ``int`` whenever the value can be
      converted; a value that cannot be converted is left unchanged.
    * ``education``: surrounding whitespace is stripped when it is a string.

    Args:
        candidate: Raw candidate record.

    Returns:
        A new dict with the original keys plus ``normalized_skills`` and
        ``experience_years``.
    """
    out = dict(candidate)

    skills = out.get("skills") or out.get("competences") or []
    if isinstance(skills, str):
        # Split on commas / semicolons / slashes / backslashes.
        skills_list = re.split(r"[,;/\\]\s*", skills)
    elif isinstance(skills, (list, tuple)):
        skills_list = list(skills)
    else:
        skills_list = []
    # Only non-empty strings can be normalized; skip anything else rather than
    # letting a single malformed entry abort the whole record.
    out["normalized_skills"] = normalize_skills_list(
        [s for s in skills_list if isinstance(s, str) and s]
    )

    years = out.get("experience_years") or 0
    if not years:
        # Try each free-text field in turn so that a summary is still consulted
        # when the experience text carries no year count.
        for field in ("experience", "summary"):
            text = out.get(field)
            if isinstance(text, str) and text:
                years = parse_experience_years(text)
                if years:
                    break
    try:
        out["experience_years"] = int(years)
    except (TypeError, ValueError, OverflowError):
        # Not convertible (e.g. "about five"): keep the caller's value as-is.
        out["experience_years"] = years

    # Basic cleanup for education: trim surrounding whitespace.
    edu = out.get("education")
    if isinstance(edu, str):
        out["education"] = edu.strip()
    return out
# Names exported by `from <this module> import *`; the underscore-prefixed
# helpers and lookup tables are intentionally left out.
__all__ = [
    "normalize_skill",
    "normalize_skills_list",
    "parse_experience_years",
    "clean_candidate",
]
if __name__ == "__main__":
    # Quick smoke test: clean one hand-written candidate and print the result.
    print(
        clean_candidate(
            {
                "skills": "react.js, JS, python, ml, AWS, docker-compose",
                "experience": "5+ years in web development",
                "education": "Bachelor in Computer Science",
            }
        )
    )