Spaces:

RHmaster
/

ai-talent-finder-backend

Running

ai-talent-finder-backend / scripts /generate_synthetic_dataset.py

ilyass yani

Deploiement backend dans HF Spaces

9df97a2 10 days ago

3.69 kB

	"""Generate synthetic CV-job pairs using `backend/tests/mock_data.py`.

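	This creates:
	- the CSV file given by --out, e.g. ../data/training_pairs.csv (N pairs; labels are generated heuristically, then flipped at random with 5% probability)
	- label_sample_200.csv in the same directory as --out (up to 200 random rows for manual review)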

	Usage:
	python generate_synthetic_dataset.py --out ../data/training_pairs.csv --n 5000
	"""
	import argparse
	import csv
	import random
	from pathlib import Path

	import importlib.util
	from pathlib import Path

	# Load mock_data.py dynamically by file path (works regardless of PYTHONPATH).
	# The file is looked up at <parent of this script's directory>/tests/mock_data.py.
	mock_path = Path(__file__).resolve().parents[1] / "tests" / "mock_data.py"
	spec = importlib.util.spec_from_file_location("mock_data", str(mock_path))
	mock_mod = importlib.util.module_from_spec(spec)
	# Run the module body so its top-level definitions become attributes of mock_mod.
	spec.loader.exec_module(mock_mod)
	# Expose the two fixture factories under short module-level names.
	get_mock_candidates = mock_mod.get_mock_candidates
	get_mock_criteria = mock_mod.get_mock_criteria


	def simple_label(candidate, job):
	    """Return 1 if any candidate skill name occurs in the job text, else 0.

	    The job text is the job's ``title`` and ``description`` joined by a space
	    and lower-cased. A skill matches when its lower-cased, stripped name is a
	    substring of that text. Matching is by substring, not by whole word, so
	    very short skill names can match inside longer words.

	    This function is deterministic; any label noise is applied by the caller.

	    Args:
	        candidate: Mapping with an optional ``skills`` entry, an iterable of
	            mappings each carrying a ``name`` key. A missing or ``None``
	            ``skills`` entry is treated as no skills.
	        job: Mapping with optional ``title`` and ``description`` strings.
	            Missing or ``None`` values are treated as empty strings.

	    Returns:
	        1 when at least one non-blank skill name is found in the job text,
	        otherwise 0.
	    """
	    title = job.get('title') or ''
	    description = job.get('description') or ''
	    text = (title + ' ' + description).lower()

	    for skill in candidate.get('skills') or []:
	        name = (skill['name'] or '').strip().lower()
	        # The empty string is a substring of every text, so a blank skill
	        # name must be skipped or it would force a positive label.
	        if name and name in text:
	            return 1
	    return 0


	def generate(out_path: str, n: int = 5000, seed=None):
	    """Write ``n`` synthetic CV/job pairs to a CSV file plus a review sample.

	    Each mock candidate is expanded into 10 copies with distinct ids
	    (``id * 100 + i``) whose ``raw_text`` is the original text followed by the
	    candidate's skill names. Pairs are drawn uniformly at random (with
	    replacement) from that pool and from the mock jobs. Each pair is labelled
	    with ``simple_label`` and the label is flipped with 5% probability to add
	    noise. The ``auto_label`` column is always written as 1.

	    A second file, ``label_sample_200.csv``, is written in the same directory
	    as ``out_path`` and holds up to 200 randomly chosen rows of the main file.

	    Args:
	        out_path: Destination of the main CSV. Missing parent directories are
	            created.
	        n: Number of pairs to generate. Values <= 0 produce header-only files.
	        seed: Optional seed. When given, a private ``random.Random(seed)`` is
	            used so the output is reproducible; when ``None`` the global
	            ``random`` module state is used.

	    Raises:
	        IndexError: If ``n > 0`` and the mock data provides no candidates or
	            no jobs to choose from.
	    """
	    fieldnames = ['cv_id', 'job_id', 'cv_text', 'job_text', 'label', 'auto_label']
	    rng = random if seed is None else random.Random(seed)

	    out = Path(out_path)
	    out.parent.mkdir(parents=True, exist_ok=True)

	    candidates = get_mock_candidates()
	    jobs = get_mock_criteria()

	    # The mock set is small, so expand every candidate into 10 id-distinct copies.
	    candidate_pool = []
	    for cand in candidates:
	        skill_names = ' '.join(s['name'] for s in cand.get('skills', []))
	        raw_text = (cand['raw_text'] or '') + ' ' + skill_names
	        for i in range(10):
	            variant = dict(cand)
	            variant['id'] = int(cand['id'] * 100 + i)
	            variant['raw_text'] = raw_text
	            candidate_pool.append(variant)

	    rows = []
	    for _ in range(n):
	        cand = rng.choice(candidate_pool)
	        job = rng.choice(jobs)
	        cv_text = cand.get('raw_text', '')
	        job_text = (job.get('title') or '') + '\n' + (job.get('description') or '')
	        label = simple_label(cand, job)
	        # Controlled noise: flip the label with 5% chance.
	        if rng.random() < 0.05:
	            label = 1 - label
	        rows.append({
	            'cv_id': cand['id'],
	            'job_id': job['id'],
	            # Newlines are flattened so each record stays on one CSV line.
	            'cv_text': cv_text.replace('\n', ' ')[:10000],
	            'job_text': job_text.replace('\n', ' ')[:5000],
	            'label': label,
	            'auto_label': 1,
	        })

	    with out.open('w', newline='', encoding='utf-8') as f:
	        writer = csv.DictWriter(f, fieldnames=fieldnames)
	        writer.writeheader()
	        writer.writerows(rows)

	    # Sample from the in-memory rows; using the known header instead of
	    # rows[0].keys() keeps this working when no rows were generated.
	    sample_out = out.parent / 'label_sample_200.csv'
	    sample = rng.sample(rows, min(200, len(rows)))
	    with sample_out.open('w', newline='', encoding='utf-8') as f:
	        writer = csv.DictWriter(f, fieldnames=fieldnames)
	        writer.writeheader()
	        writer.writerows(sample)

	    print(f'Generated {len(rows)} synthetic pairs -> {out_path}')
	    print(f'Wrote sample 200 -> {sample_out}')


	if __name__ == '__main__':
	    # Command-line entry point: --out is mandatory, --n falls back to 5000.
	    cli = argparse.ArgumentParser()
	    cli.add_argument('--out', required=True)
	    cli.add_argument('--n', type=int, default=5000)
	    options = cli.parse_args()
	    generate(out_path=options.out, n=options.n)