# File: ai-talent-finder-backend/scripts/generate_synthetic_dataset.py (3.69 kB)
# Repository page metadata: ilyass yani, "Deploiement backend dans HF Spaces", 9df97a2
"""Generate synthetic CV-job pairs using `backend/tests/mock_data.py`.
This creates:
- ../data/training_pairs.csv (N pairs, labels auto-generated heuristically)
- ../data/label_sample_200.csv (200 random samples for manual review)
Usage:
python generate_synthetic_dataset.py --out ../data/training_pairs.csv --n 5000
"""
import argparse
import csv
import importlib.util
import random
import re
from pathlib import Path
# Load mock_data.py dynamically (works regardless of PYTHONPATH)
# The path is resolved relative to this script: parents[0] is the directory
# containing this file, parents[1] is its parent, so the module is expected
# at <parent-of-script-dir>/tests/mock_data.py.
mock_path = Path(__file__).resolve().parents[1] / "tests" / "mock_data.py"
# Build an import spec directly from the file path and execute it under the
# module name "mock_data". This runs at import time of this script, so a
# missing or broken mock_data.py makes the script fail before any CLI parsing.
spec = importlib.util.spec_from_file_location("mock_data", str(mock_path))
mock_mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mock_mod)
# Re-export the two factories used below. Judging from their use in this file,
# both return sequences of dicts (candidates carry 'id', 'raw_text', 'skills';
# jobs carry 'id', 'title', 'description') -- TODO confirm against mock_data.py.
get_mock_candidates = mock_mod.get_mock_candidates
get_mock_criteria = mock_mod.get_mock_criteria
def simple_label(candidate, job):
    """Return 1 if any candidate skill is named in the job text, else 0.

    The job text is the lower-cased title and description joined by a space.
    A skill counts as present only when its (lower-cased) name occurs as a
    whole token, i.e. it is not directly preceded or followed by a word
    character. This prevents short names such as "R", "C" or "Java" from
    matching inside unrelated words ("developer", "JavaScript").

    The function is deterministic; it adds no label noise itself.

    Args:
        candidate: Mapping with an optional 'skills' list of mappings, each
            carrying a 'name' string. Missing, ``None`` or blank names are
            ignored.
        job: Mapping with optional 'title' and 'description' strings;
            missing or ``None`` values are treated as empty.

    Returns:
        1 when at least one skill name is found in the job text, otherwise 0.
    """
    title = job.get('title') or ''
    description = job.get('description') or ''
    text = f'{title} {description}'.lower()

    for skill in candidate.get('skills') or []:
        name = (skill.get('name') or '').strip().lower()
        if not name:
            # An empty name would match any text; never treat it as a hit.
            continue
        # Lookarounds rather than \b so names ending in punctuation
        # (e.g. "c++", "node.js") are still matched correctly.
        if re.search(rf'(?<!\w){re.escape(name)}(?!\w)', text):
            return 1
    return 0
def _build_candidate_pool(candidates, copies=10):
    """Expand each candidate into *copies* shallow clones with derived ids."""
    pool = []
    for cand in candidates:
        # The appended skill names are the same for every clone, so build
        # the text once per candidate instead of once per clone.
        skill_names = ' '.join(s['name'] for s in cand.get('skills', []))
        raw_text = (cand.get('raw_text') or '') + ' ' + skill_names
        for i in range(copies):
            variant = dict(cand)
            # Derived id keeps clones distinguishable: <original id>*100 + i.
            variant['id'] = int(cand['id'] * 100 + i)
            variant['raw_text'] = raw_text
            pool.append(variant)
    return pool


def _write_csv(path, fieldnames, rows):
    """Write *rows* (dicts) to *path* as UTF-8 CSV preceded by a header line."""
    with path.open('w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def generate(out_path: str, n: int = 5000, seed: 'int | None' = None):
    """Write *n* synthetic CV-job pairs plus a review sample of up to 200 rows.

    Candidates and jobs come from ``get_mock_candidates()`` and
    ``get_mock_criteria()``. Each candidate is cloned 10 times (clones differ
    only by id), then *n* random (candidate, job) pairs are drawn with
    replacement. Each pair is labelled by ``simple_label`` and the label is
    flipped with probability 0.05 to inject controlled noise.

    Two files are produced:
      * ``out_path`` -- all generated pairs;
      * ``label_sample_200.csv`` in the same directory -- a random sample of
        ``min(200, n)`` of those pairs for manual review.

    Args:
        out_path: Destination CSV path; parent directories are created.
        n: Number of pairs to generate. Values <= 0 produce header-only files.
        seed: Optional seed for a private random generator, giving
            reproducible output. When ``None`` the module-level ``random``
            state is used, so a prior ``random.seed()`` call still applies.

    Raises:
        ValueError: If ``n > 0`` but the mock data has no candidates or jobs.
    """
    fieldnames = ['cv_id', 'job_id', 'cv_text', 'job_text', 'label', 'auto_label']
    rng = random if seed is None else random.Random(seed)

    candidate_pool = _build_candidate_pool(get_mock_candidates())
    jobs = get_mock_criteria()
    if n > 0 and not (candidate_pool and jobs):
        # Fail early with a clear message instead of an opaque IndexError
        # from random.choice after a partial file has been written.
        raise ValueError('mock data must provide at least one candidate and one job')

    out = Path(out_path)
    out.parent.mkdir(parents=True, exist_ok=True)

    rows = []
    for _ in range(max(n, 0)):
        cand = rng.choice(candidate_pool)
        job = rng.choice(jobs)
        job_text = (job.get('title') or '') + '\n' + (job.get('description') or '')
        label = simple_label(cand, job)
        # add controlled noise: flip label with 5% chance
        if rng.random() < 0.05:
            label = 1 - label
        rows.append({
            'cv_id': cand['id'],
            'job_id': job['id'],
            # Newlines are flattened so every record stays on one CSV line.
            'cv_text': cand.get('raw_text', '').replace('\n', ' ')[:10000],
            'job_text': job_text.replace('\n', ' ')[:5000],
            'label': label,
            'auto_label': 1,
        })
    _write_csv(out, fieldnames, rows)

    # Sample straight from memory: avoids re-reading the file that was just
    # written and works when no rows were generated.
    sample_out = out.parent / 'label_sample_200.csv'
    sample = rng.sample(rows, min(200, len(rows)))
    _write_csv(sample_out, fieldnames, sample)

    print(f'Generated {len(rows)} synthetic pairs -> {out_path}')
    print(f'Wrote sample 200 -> {sample_out}')
if __name__ == '__main__':
    # Command-line entry point: --out is mandatory, --n defaults to 5000 pairs.
    cli = argparse.ArgumentParser()
    cli.add_argument('--out', required=True)
    cli.add_argument('--n', type=int, default=5000)
    options = cli.parse_args()
    generate(out_path=options.out, n=options.n)