ai-talent-finder-backend / scripts /analyze_and_enrich_skills.py
ilyass yani
Deploiement backend dans HF Spaces
9df97a2
Raw
History Blame
5.06 kB
#!/usr/bin/env python3
"""
Analyze skills dictionary coverage and propose enrichments.
Run: PYTHONPATH=. python3 backend/scripts/analyze_and_enrich_skills.py
Generates: backend/ai_module/data/skills_enrichment_suggestions.json
"""
import csv
import json
import re
import unicodedata
from collections import Counter
from pathlib import Path
# Every location below is derived from ROOT, the third ancestor directory of
# this file's resolved path.
ROOT = Path(__file__).resolve().parents[2]

# Skills dictionary that is read, and the suggestions file written next to it.
SKILLS_FILE = ROOT.joinpath('backend', 'ai_module', 'data', 'skills_dictionary.json')
OUT_FILE = SKILLS_FILE.with_name('skills_enrichment_suggestions.json')

# Directory searched recursively for CSV files.
DATA_DIR = ROOT.joinpath('data')
# Matches every run of characters other than lowercase ASCII letters, digits,
# '+', '#', '.', '-' and space; normalize() turns each run into one space.
skill_norm_re = re.compile(r"[^a-z0-9+#+\.\- ]+")


def normalize(s: str) -> str:
    """Return a canonical, comparison-friendly form of a skill phrase.

    The text is trimmed, accents are folded onto their base letters
    (``Développement`` -> ``developpement``) and the result is lower-cased.
    ``/`` becomes a space and ``&`` becomes `` and ``.  Every remaining run of
    characters other than ``a-z``, ``0-9``, ``+``, ``#``, ``.``, ``-`` and
    space is replaced by a space, then whitespace is collapsed and trimmed.

    Args:
        s: Raw phrase.

    Returns:
        The normalized phrase; may be empty when nothing survives filtering.
    """
    # Canonical decomposition splits "é" into "e" plus a combining accent;
    # dropping the combining marks keeps the base letter.  Without this step
    # the ASCII-only filter below would break "développement" into
    # "d veloppement".
    decomposed = unicodedata.normalize('NFD', s.strip())
    text = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    text = text.lower()
    text = text.replace('/', ' ').replace('&', ' and ')
    text = skill_norm_re.sub(' ', text)
    # Only plain spaces can remain as whitespace at this point, so a
    # split/join collapses repeated spaces and trims both ends.
    return ' '.join(text.split())
def load_skills_dict():
    """Load the skills dictionary and return its entries as a normalized set.

    Reads ``SKILLS_FILE`` as UTF-8 JSON.  Two layouts are understood:

    * a list: every element is taken as a skill;
    * a dict: a list value contributes its elements (the key itself is not
      added), any other value contributes the key.

    Entries that are not strings, or that are empty after ``normalize``, are
    ignored.

    Returns:
        set: Normalized skill names.  Empty when the file does not exist (a
        message is printed) or when the JSON root is neither a list nor a
        dict.

    Raises:
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    if not SKILLS_FILE.exists():
        print('Skills dictionary not found at', SKILLS_FILE)
        return set()

    data = json.loads(SKILLS_FILE.read_text(encoding='utf-8'))

    if isinstance(data, dict):
        items = []
        for key, value in data.items():
            if isinstance(value, list):
                items.extend(value)
            else:
                items.append(key)
    elif isinstance(data, list):
        items = data
    else:
        items = []

    # Only strings can be normalized; a number or nested object in the JSON
    # would otherwise raise AttributeError inside normalize().
    normalized = {normalize(item) for item in items if isinstance(item, str)}
    # Entries made only of filtered-out characters collapse to '' and are not
    # real skills.
    normalized.discard('')
    return normalized
# Header names of the free-text columns that are mined for skills.
_TEXT_COLUMNS = ('cv_text', 'job_text')
# "skills:" / "skills -" marker (case-insensitive); the rest of the cell after
# the first marker is treated as the skill list.
_SKILLS_MARKER_RE = re.compile(r"skills\s*[:\-]\s*(.+)$", flags=re.IGNORECASE)
# Item separators inside a skill list: ';', ',', backslash and '|'.
_SKILL_SEPARATOR_RE = re.compile(r'[;,\\|]+')
# Number of trailing characters inspected when a cell has no marker.
_FALLBACK_TAIL_CHARS = 400
# Longest fragment, in words, that is still counted as a candidate.
_MAX_CANDIDATE_WORDS = 6


def _candidate_phrases(cell):
    """Yield the raw skill-like fragments found in one text cell."""
    flat = cell.replace('\r', ' ').replace('\n', ' ')
    marker = _SKILLS_MARKER_RE.search(flat)
    # Without an explicit marker, fall back to the end of the text.
    blob = marker.group(1) if marker else flat[-_FALLBACK_TAIL_CHARS:]
    for fragment in _SKILL_SEPARATOR_RE.split(blob):
        fragment = fragment.strip()
        if len(fragment) < 2:
            continue
        is_short = len(fragment.split()) <= _MAX_CANDIDATE_WORDS
        if is_short and any(ch.isalpha() for ch in fragment):
            yield fragment


def scan_data_files():
    """Count skill-like phrases found in the CSV files under ``DATA_DIR``.

    Every ``*.csv`` file below ``DATA_DIR`` is read with its first row as the
    header.  Only the ``cv_text`` and ``job_text`` columns are inspected;
    files with neither column are skipped.  Each non-empty cell is split into
    fragments, and every fragment is normalized with ``normalize`` and
    counted.

    A file that cannot be read or parsed is reported with a printed message
    and skipped; phrases counted from it before the error are kept.

    Returns:
        Counter: Normalized phrase -> number of occurrences.
    """
    tokens = Counter()
    # Sorted so that the order of equally frequent phrases does not depend on
    # the order in which the file system lists the files.
    for path in sorted(DATA_DIR.glob('**/*.csv')):
        try:
            # newline='' is what the csv module expects for correct handling
            # of newlines inside quoted fields; utf-8-sig drops a leading BOM
            # that would otherwise be glued to the first header name.
            with path.open('r', encoding='utf-8-sig', newline='') as fh:
                reader = csv.reader(fh)
                headers = next(reader, None)
                if not headers:
                    continue
                header_map = {h.strip(): i for i, h in enumerate(headers)}
                columns = [header_map[name] for name in _TEXT_COLUMNS if name in header_map]
                if not columns:
                    continue
                for row in reader:
                    for idx in columns:
                        # Short rows may not reach the column at all.
                        if idx >= len(row) or not row[idx]:
                            continue
                        for phrase in _candidate_phrases(row[idx]):
                            key = normalize(phrase)
                            if key:
                                tokens[key] += 1
        except (OSError, UnicodeError, csv.Error) as e:
            print('Skipping', path, 'error', e)
    return tokens
# Whole words that mark a phrase as descriptive text rather than a skill.
# Matching on word boundaries (no ASCII letter on either side) keeps real
# skills such as "data engineering", which a plain substring test for
# "engineer" would throw away.
_NON_SKILL_WORD_RE = re.compile(
    r'(?<![a-z])'
    r'(?:experience[ds]?|looking|years?|candidates?|engineers?|developers?)'
    r'(?![a-z])'
)


def extract_skill_phrases(counter: Counter, top_n=200):
    """Return the most frequent phrases that plausibly name a skill.

    Only the ``top_n * 5`` most common entries of ``counter`` are examined.
    A phrase is dropped when it is shorter than 3 characters, longer than
    5 words, or contains one of the non-skill words (experience, looking,
    year, candidate, engineer, developer and their plural forms) as a whole
    word.

    Args:
        counter: Phrase -> occurrence count.
        top_n: Maximum number of phrases to return.

    Returns:
        list: ``(phrase, count)`` tuples in descending count order, at most
        ``top_n`` long.
    """
    if top_n <= 0:
        return []

    kept = []
    # most_common() already yields descending counts, so the first top_n
    # survivors are exactly the top_n most frequent accepted phrases.
    for phrase, count in counter.most_common(top_n * 5):
        if len(phrase) < 3 or len(phrase.split()) > 5:
            continue
        if _NON_SKILL_WORD_RE.search(phrase):
            continue
        kept.append((phrase, count))
        if len(kept) == top_n:
            break
    return kept
def main():
    """Run the analysis and write the enrichment suggestions file.

    Loads the normalized skills dictionary, counts candidate phrases in the
    data CSV files, keeps the 300 most frequent plausible phrases and writes
    those absent from the dictionary to ``OUT_FILE`` as JSON, together with a
    small summary.  Progress is printed along the way.
    """
    print('Loading skills dictionary...')
    known = load_skills_dict()
    print(f'Loaded {len(known)} normalized skills from dictionary')

    print('Scanning data CSV files for skill-like phrases...')
    phrase_counts = scan_data_files()
    print(f'Extracted {len(phrase_counts)} candidate tokens from data files')

    ranked = extract_skill_phrases(phrase_counts, top_n=300)
    missing = [
        {'skill': phrase, 'count': count}
        for phrase, count in ranked
        if phrase not in known
    ]
    print(f'Found {len(missing)} high-frequency tokens missing from dictionary (top {len(ranked)})')

    report = {
        'missing_suggestions': missing,
        'summary': {
            'dict_count': len(known),
            'tokens_scanned': len(phrase_counts),
        },
    }
    # The target directory may not exist yet.
    OUT_FILE.parent.mkdir(parents=True, exist_ok=True)
    OUT_FILE.write_text(
        json.dumps(report, indent=2, ensure_ascii=False),
        encoding='utf-8',
    )
    print('Wrote suggestions to', OUT_FILE)


if __name__ == '__main__':
    main()