# Experimental higher-capacity recipe for RTX 6000 Ada 48/50GB.
# Use only if Qwen3-8B underfits. On OOM, reduce batch size first; since
# per_device_train_batch_size is already 1 here, consider lowering max_length next.
# --- Model, dataset, and sequence handling ---
# Base model and dataset are given as "owner/name" identifiers
# (presumably Hugging Face Hub ids — confirm against the loader).
model_name_or_path: Qwen/Qwen3-14B
dataset_name: nraptisss/TMF921-intent-to-config-research-sota
# NOTE(review): both split names must exist in the dataset above; that cannot
# be verified from this file.
train_split: train_sota
eval_split: validation
# Local output directory and the target repo id for this 14B QLoRA run.
output_dir: outputs/qwen3-14b-tmf921-qlora
hub_model_id: nraptisss/Qwen3-14B-TMF921-Intent-QLoRA-ResearchSOTA
# Same value (2048) as generation_max_new_tokens later in this file.
max_length: 2048
packing: false
# Presumably restricts the loss to assistant turns — confirm in the training script.
assistant_only_loss: true
dataset_num_proc: 8
# --- Quantization ---
# 4-bit loading with the nf4 quant type and double quantization enabled.
# The bnb_ prefix suggests bitsandbytes — TODO confirm in the model loader.
load_in_4bit: true
bnb_4bit_quant_type: nf4
bnb_4bit_use_double_quant: true
# --- LoRA adapter ---
# alpha / r = 16 / 64 = 0.25.
# NOTE(review): if the consumer applies the usual alpha / r scaling, adapter
# updates are scaled by 0.25 — confirm this ratio is intended.
lora_r: 64
lora_alpha: 16
lora_dropout: 0.05
# A keyword rather than an explicit module list; how it is expanded depends on
# the adapter library used by the training script — verify there.
lora_target_modules: all-linear
# --- Optimization ---
epochs: 2
# Scheduler is "constant" with zero warmup steps, so no LR ramp or decay is
# configured here.
learning_rate: 0.0002
lr_scheduler_type: constant
warmup_steps: 0
weight_decay: 0.0
max_grad_norm: 0.3
# 1 x 16 = 16 sequences per device per optimizer step, assuming the usual
# batch-size x accumulation semantics — confirm against the trainer.
per_device_train_batch_size: 1
gradient_accumulation_steps: 16
per_device_eval_batch_size: 1
# Precision, activation checkpointing, and optimizer selection.
bf16: true
gradient_checkpointing: true
optim: paged_adamw_32bit
# --- Logging, checkpointing, and publishing ---
logging_steps: 10
# Evaluation and checkpoint intervals are the same (250 steps).
eval_steps: 250
save_steps: 250
save_total_limit: 3
# Run and project labels for experiment tracking.
run_name: qwen3-14b-tmf921-qlora-r64
project: tmf921-intent-sft
# NOTE(review): how the consumer treats a null space id is not visible in this file.
trackio_space_id: null
# Target repo is hub_model_id, set earlier in this file.
push_to_hub: true
# --- Held-out evaluation and generation ---
# Five test splits, listed separately from the single eval_split set earlier
# in this file.
eval_splits:
  - test_in_distribution
  - test_template_ood
  - test_use_case_ood
  - test_sector_ood
  - test_adversarial
# Temperature 0.0 with top_p 1.0 — presumably greedy decoding; confirm how the
# evaluation script handles a zero temperature.
generation_max_new_tokens: 2048
generation_temperature: 0.0
generation_top_p: 1.0
# null — presumably "no per-split cap"; verify in the evaluation script.
eval_max_samples_per_split: null