| """Generate synthetic CV-job pairs using `backend/tests/mock_data.py`. |
| |
| This creates: |
| - ../data/training_pairs.csv (the --out path; N pairs, labels auto-generated heuristically, with about 5% of them randomly flipped as noise) |
| - label_sample_200.csv in the same directory as --out (up to 200 random rows for manual review) |
| |
| Usage: |
| python generate_synthetic_dataset.py --out ../data/training_pairs.csv --n 5000 |
| """ |
import argparse
import csv
import importlib.util
import random
import re
from pathlib import Path
|
|
| |
# Load the mock fixtures by file path (tests/mock_data.py under the parent of
# this script's directory) instead of through a package import.
mock_path = Path(__file__).resolve().parents[1] / "tests" / "mock_data.py"
spec = importlib.util.spec_from_file_location("mock_data", str(mock_path))
# spec_from_file_location() may return None, and a spec may carry no loader;
# fail with a readable error instead of an AttributeError further down.
if spec is None or spec.loader is None:
    raise ImportError(f"Cannot load mock data module from {mock_path}")
mock_mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mock_mod)

# Re-export the two fixture factories used by generate().
get_mock_candidates = mock_mod.get_mock_candidates
get_mock_criteria = mock_mod.get_mock_criteria
|
|
|
|
def simple_label(candidate, job):
    """Heuristically label a candidate/job pair.

    The job's title and description are joined and lower-cased; the pair is
    labelled positive as soon as one of the candidate's skill names occurs in
    that text as a whole term.

    Args:
        candidate: Mapping with an optional ``'skills'`` list of mappings,
            each carrying a ``'name'`` string.
        job: Mapping with optional ``'title'`` and ``'description'`` strings.

    Returns:
        1 if at least one skill name is mentioned in the job text, else 0.
    """
    # `or ''` also covers keys that are present but set to None.
    title = job.get('title') or ''
    description = job.get('description') or ''
    text = f'{title} {description}'.lower()

    for skill in candidate.get('skills') or []:
        name = (skill.get('name') or '').strip().lower()
        if not name:
            # An empty name is a substring of every text; never count it.
            continue
        # Require non-word characters (or the text edges) around the name so
        # that "java" does not match "javascript" and "r" does not match any
        # word containing that letter. Lookarounds are used instead of \b so
        # names ending in punctuation, such as "c++", still match.
        if re.search(rf'(?<!\w){re.escape(name)}(?!\w)', text):
            return 1

    return 0
|
|
|
|
def generate(out_path: str, n: int = 5000):
    """Write ``n`` synthetic CV/job pairs to a CSV plus a review sample.

    Every mock candidate is expanded into 10 variants (ids ``id * 100 + i``)
    whose text is the candidate's ``raw_text`` followed by its skill names.
    Each output row pairs a random variant with a random job, labels it with
    ``simple_label`` and flips that label with probability 0.05 to inject
    noise. A random sample of up to 200 of the written rows is saved as
    ``label_sample_200.csv`` in the same directory as ``out_path``.

    Args:
        out_path: Destination CSV path; missing parent directories are created.
        n: Number of pairs to write. Values <= 0 produce header-only files.

    Raises:
        ValueError: If the mock data yields no candidates or no jobs.
    """
    fieldnames = ['cv_id', 'job_id', 'cv_text', 'job_text', 'label', 'auto_label']

    out = Path(out_path)
    out.parent.mkdir(parents=True, exist_ok=True)

    jobs = get_mock_criteria()

    candidate_pool = []
    for cand in get_mock_candidates():
        # The enriched text is the same for all variants of a candidate, so
        # build it once rather than once per variant.
        skill_names = ' '.join(s['name'] for s in cand.get('skills', []))
        enriched_text = (cand['raw_text'] or '') + ' ' + skill_names
        for i in range(10):
            variant = dict(cand)
            variant['id'] = int(cand['id'] * 100 + i)
            variant['raw_text'] = enriched_text
            candidate_pool.append(variant)

    # random.choice() on an empty pool only raises an opaque IndexError.
    if not candidate_pool or not jobs:
        raise ValueError('mock data must provide at least one candidate and one job')

    # Rows are kept in memory so the review sample can be drawn without
    # re-reading (and re-parsing) the file that was just written.
    rows = []
    with out.open('w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()

        while len(rows) < n:
            cand = random.choice(candidate_pool)
            job = random.choice(jobs)
            cv_text = cand.get('raw_text', '')
            job_text = job.get('title', '') + '\n' + job.get('description', '')
            label = simple_label(cand, job)

            # Flip roughly 5% of the labels to simulate annotation noise.
            if random.random() < 0.05:
                label = 1 - label

            row = {
                'cv_id': cand['id'],
                'job_id': job['id'],
                # Keep each record on a single line and bound its size.
                'cv_text': cv_text.replace('\n', ' ')[:10000],
                'job_text': job_text.replace('\n', ' ')[:5000],
                'label': label,
                'auto_label': 1,
            }
            writer.writerow(row)
            rows.append(row)

    sample_out = out.parent / 'label_sample_200.csv'
    sample = random.sample(rows, min(200, len(rows)))
    with sample_out.open('w', newline='', encoding='utf-8') as f:
        # Use the fixed header list: rows[0] does not exist when n <= 0.
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(sample)

    print(f'Generated {n} synthetic pairs -> {out_path}')
    print(f'Wrote sample 200 -> {sample_out}')
|
|
|
|
if __name__ == '__main__':
    # Command-line entry point: --out is mandatory, --n defaults to 5000 pairs.
    cli = argparse.ArgumentParser()
    cli.add_argument('--out', required=True)
    cli.add_argument('--n', type=int, default=5000)
    options = cli.parse_args()
    generate(out_path=options.out, n=options.n)
|
|