"""Generate synthetic CV-job pairs using `backend/tests/mock_data.py`.

This creates:
- the CSV given by --out, e.g. ../data/training_pairs.csv (N pairs, labels generated heuristically with 5% random flips)
- label_sample_200.csv in the same directory as --out (up to 200 random rows for manual review)

Usage:
    python generate_synthetic_dataset.py --out ../data/training_pairs.csv --n 5000
"""
import argparse
import csv
import random
from pathlib import Path

import importlib.util
from pathlib import Path

# Load mock_data.py by file path so the script works regardless of PYTHONPATH.
# The path is resolved relative to this file: the parent of this script's
# directory, then tests/mock_data.py.
mock_path = Path(__file__).resolve().parents[1] / "tests" / "mock_data.py"
spec = importlib.util.spec_from_file_location("mock_data", str(mock_path))
mock_mod = importlib.util.module_from_spec(spec)
# Runs mock_data.py's top-level code inside mock_mod. The module is not
# registered in sys.modules, so a plain `import mock_data` elsewhere will not
# find this instance.
spec.loader.exec_module(mock_mod)
# Expose the two fixture factories that generate() calls.
get_mock_candidates = mock_mod.get_mock_candidates
get_mock_criteria = mock_mod.get_mock_criteria


def simple_label(candidate, job):
    """Return a heuristic match label for a candidate/job pair.

    The label is 1 when at least one of the candidate's skill names occurs as
    a case-insensitive substring of the job's title or description, and 0
    otherwise. The function is deterministic: it adds no noise itself.

    Args:
        candidate: Mapping with an optional ``'skills'`` list; each skill is a
            mapping with a ``'name'`` entry.
        job: Mapping with optional ``'title'`` and ``'description'`` entries.

    Returns:
        1 if any non-empty skill name is found in the job text, else 0.
    """
    # Treat missing or None fields as empty text instead of raising.
    title = job.get('title') or ''
    description = job.get('description') or ''
    text = f'{title} {description}'.lower()

    for skill in candidate.get('skills') or []:
        name = (skill.get('name') or '').strip().lower()
        # An empty name is a substring of every string; skipping it prevents
        # a blank skill entry from turning every pair into a positive.
        if name and name in text:
            return 1
    return 0


def generate(out_path: str, n: int = 5000):
    """Write ``n`` synthetic CV/job pairs to a CSV plus a small review sample.

    Each row pairs a randomly chosen candidate variant with a randomly chosen
    job. The label comes from ``simple_label`` and is flipped with 5%
    probability to inject noise. A second file, ``label_sample_200.csv``, is
    written next to ``out_path`` with up to 200 randomly sampled rows.

    Args:
        out_path: Destination CSV path; parent directories are created.
        n: Number of pairs to generate. A value of zero or less produces
            header-only files.

    Raises:
        IndexError: If ``n`` is positive and the mock data provides no
            candidates or no jobs (raised by ``random.choice``).
    """
    out = Path(out_path)
    out.parent.mkdir(parents=True, exist_ok=True)

    fieldnames = ['cv_id', 'job_id', 'cv_text', 'job_text', 'label', 'auto_label']

    candidates = get_mock_candidates()
    jobs = get_mock_criteria()

    # Expand each candidate into 10 entries. The entries share the same text
    # (raw_text with the skill names appended) and differ only in their id.
    candidate_pool = []
    for cand in candidates:
        skills_text = ' '.join([s['name'] for s in cand.get('skills', [])])
        raw_text = (cand.get('raw_text') or '') + ' ' + skills_text
        for i in range(10):
            variant = dict(cand)
            variant['id'] = int(cand['id'] * 100 + i)
            variant['raw_text'] = raw_text
            candidate_pool.append(variant)

    # Keep the rows in memory so the review sample can be drawn without
    # re-reading the file that was just written.
    rows = []
    with out.open('w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()

        for _ in range(n):
            cand = random.choice(candidate_pool)
            job = random.choice(jobs)
            cv_text = cand.get('raw_text', '')
            # Tolerate None title/description instead of raising TypeError.
            job_text = (job.get('title') or '') + '\n' + (job.get('description') or '')
            label = simple_label(cand, job)
            # Controlled noise: flip the label with 5% probability.
            if random.random() < 0.05:
                label = 1 - label
            row = {
                'cv_id': cand['id'],
                'job_id': job['id'],
                # Flatten newlines and cap the length to keep rows compact.
                'cv_text': cv_text.replace('\n', ' ')[:10000],
                'job_text': job_text.replace('\n', ' ')[:5000],
                'label': label,
                'auto_label': 1,
            }
            writer.writerow(row)
            rows.append(row)

    # Draw the manual-review sample. Using the fixed fieldnames (not
    # rows[0].keys()) keeps this working when no rows were generated.
    sample_out = out.parent / 'label_sample_200.csv'
    sample = random.sample(rows, min(200, len(rows)))
    with sample_out.open('w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(sample)

    print(f'Generated {n} synthetic pairs -> {out_path}')
    print(f'Wrote sample 200 -> {sample_out}')


if __name__ == '__main__':
    # Command-line entry point: --out is mandatory, --n defaults to 5000.
    cli = argparse.ArgumentParser()
    cli.add_argument('--out', required=True)
    cli.add_argument('--n', type=int, default=5000)
    options = cli.parse_args()
    generate(out_path=options.out, n=options.n)