---
# Experimental higher-capacity recipe for RTX 6000 Ada 48/50GB.
# Use only if Qwen3-8B underfits; reduce batch size first if OOM.
#
# NOTE(review): per_device_train_batch_size below is already 1, so there is
# no headroom left to "reduce batch size" in this file. Confirm the intended
# OOM fallback (presumably lowering max_length) with the recipe owner.

# Base model, dataset and output locations.
model_name_or_path: Qwen/Qwen3-14B
dataset_name: nraptisss/TMF921-intent-to-config-research-sota
train_split: train_sota
eval_split: validation
output_dir: outputs/qwen3-14b-tmf921-qlora
hub_model_id: nraptisss/Qwen3-14B-TMF921-Intent-QLoRA-ResearchSOTA

# Sequence handling and preprocessing.
max_length: 2048
packing: false
assistant_only_loss: true
dataset_num_proc: 8

# 4-bit quantization of the base model.
load_in_4bit: true
bnb_4bit_quant_type: nf4
bnb_4bit_use_double_quant: true

# LoRA adapter settings.
lora_r: 64
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: all-linear

# Optimization schedule.
epochs: 2
# Kept in plain decimal form: exponent notation without a dot (e.g. 2e-4) is
# read as a string, not a float, by YAML 1.1 loaders such as PyYAML.
learning_rate: 0.0002
lr_scheduler_type: constant
warmup_steps: 0
weight_decay: 0.0
max_grad_norm: 0.3

# Batch sizing; 1 sample per device x 16 accumulation steps per update.
per_device_train_batch_size: 1
gradient_accumulation_steps: 16
per_device_eval_batch_size: 1

# Precision, memory and optimizer choices.
bf16: true
gradient_checkpointing: true
optim: paged_adamw_32bit

# Logging, evaluation and checkpoint cadence (in steps).
logging_steps: 10
eval_steps: 250
save_steps: 250
save_total_limit: 3

# Run tracking and publishing.
run_name: qwen3-14b-tmf921-qlora-r64
project: tmf921-intent-sft
trackio_space_id: null
push_to_hub: true

# Held-out splits listed for evaluation, in addition to eval_split above.
eval_splits:
  - test_in_distribution
  - test_template_ood
  - test_use_case_ood
  - test_sector_ood
  - test_adversarial

# Generation settings used during evaluation.
generation_max_new_tokens: 2048
generation_temperature: 0.0
generation_top_p: 1.0
eval_max_samples_per_split: null