#!/usr/bin/env python3
"""
Adapter 3: Vision / UI / Frontend — SFT Training (Multimodal)
Model: Qwen/Qwen3.6-27B (AutoModelForImageTextToText)
Dataset: keypa/qwen36-adapter-vision-sft (WebSight subset)
"""

import os
from itertools import islice

from datasets import Dataset, Features, Image, Value, load_dataset
from peft import LoraConfig
from trl import SFTConfig, SFTTrainer
# AutoTokenizer is no longer used directly (the processor wraps the tokenizer);
# the import is kept so the module namespace stays backward-compatible.
from transformers import AutoModelForImageTextToText, AutoProcessor, AutoTokenizer  # noqa: F401
import trackio

MODEL_ID = "Qwen/Qwen3.6-27B"
DATASET_ID = "keypa/qwen36-adapter-vision-sft"
FALLBACK_DATASET = "HuggingFaceM4/WebSight"
FALLBACK_CONFIG = "v0.2"
FALLBACK_MAX_EXAMPLES = 20000
FALLBACK_PROMPT = "Generate HTML/CSS for this screenshot."
OUTPUT_DIR = "./outputs/adapter-vision-sft"
HUB_MODEL_ID = "keypa/qwen36-27b-adapter-vision-sft"
TRACKIO_PROJECT = "qwen36-adapters"
TRACKIO_RUN_NAME = "vision-sft-r16-lr2e5"

LORA_RANK = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05
TARGET_MODULES = "all-linear"

NUM_EPOCHS = 2
PER_DEVICE_BATCH = 1
GRAD_ACCUM = 8
LEARNING_RATE = 2e-5
WARMUP_RATIO = 0.1
MAX_LENGTH = 4096
# Sequence packing is a text-only feature in TRL; it must stay off for
# image+text samples, so it is disabled for this multimodal run.
USE_PACKING = False
FREEZE_VISION_TOWER = True


def _build_fallback_dataset():
    """Build an in-memory SFT dataset from the first WebSight examples.

    Streams ``FALLBACK_DATASET`` and converts up to ``FALLBACK_MAX_EXAMPLES``
    rows into the conversational vision layout: a top-level ``images`` column
    plus ``messages`` whose ``content`` is always a list of typed parts.

    Returns:
        A ``datasets.Dataset`` with ``images`` and ``messages`` columns.
    """
    stream = load_dataset(
        FALLBACK_DATASET, FALLBACK_CONFIG, split="train", streaming=True
    )

    # Explicit schema: `content` must have ONE Arrow type for every message
    # (a list of {type, text} parts), and images must be stored through the
    # Image feature so they decode back to PIL images at training time.
    features = Features(
        {
            "images": [Image()],
            "messages": [
                {
                    "role": Value("string"),
                    "content": [
                        {"type": Value("string"), "text": Value("string")}
                    ],
                }
            ],
        }
    )

    examples = []
    for ex in islice(stream, FALLBACK_MAX_EXAMPLES):
        messages = [
            {
                "role": "user",
                "content": [
                    # Placeholder only: the pixels live in the `images` column.
                    {"type": "image", "text": None},
                    {"type": "text", "text": FALLBACK_PROMPT},
                ],
            },
            {
                "role": "assistant",
                "content": [{"type": "text", "text": ex.get("text", "")}],
            },
        ]
        examples.append({"images": [ex["image"]], "messages": messages})

    return Dataset.from_list(examples, features=features)


def _load_train_dataset():
    """Return the training split, falling back to WebSight on any load error.

    The fallback is deliberate best-effort behaviour, but the original error
    is reported so that a typo or an auth problem does not go unnoticed.
    """
    try:
        return load_dataset(DATASET_ID, split="train")
    except Exception as err:  # best-effort: any load failure triggers fallback
        print(
            f"Could not load {DATASET_ID} ({err!r}); "
            f"falling back to {FALLBACK_DATASET}."
        )
        return _build_fallback_dataset()


def main():
    """Run LoRA SFT of the vision-language model and push the adapter.

    Steps: start Trackio logging, load processor and model, configure LoRA
    and the SFT arguments, load the dataset (with fallback), train, save the
    final adapter under ``OUTPUT_DIR/final`` and send a completion alert.
    """
    # `trackio.init` names the run via `name` (there is no `run_name` kwarg).
    trackio.init(project=TRACKIO_PROJECT, name=TRACKIO_RUN_NAME)

    # A multimodal model needs the full processor (tokenizer + image
    # processor); a bare tokenizer cannot turn images into model inputs.
    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
    # Fall back to the object itself if it exposes no `.tokenizer` attribute.
    tokenizer = getattr(processor, "tokenizer", processor)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print(f"Loading {MODEL_ID}...")
    model = AutoModelForImageTextToText.from_pretrained(
        MODEL_ID,
        torch_dtype="bfloat16",
        device_map="auto",
        trust_remote_code=True,
    )

    # NOTE(review): PEFT freezes base weights on its own, and
    # TARGET_MODULES="all-linear" presumably still attaches trainable LoRA
    # adapters inside the vision tower — confirm whether the vision modules
    # should be excluded from LoRA for this flag to have the intended effect.
    if FREEZE_VISION_TOWER and hasattr(model, "visual"):
        print("Freezing vision tower...")
        for param in model.visual.parameters():
            param.requires_grad = False

    total_params = sum(p.numel() for p in model.parameters())
    print(f"Model loaded. Params: {total_params / 1e9:.1f}B")

    peft_config = LoraConfig(
        r=LORA_RANK,
        lora_alpha=LORA_ALPHA,
        target_modules=TARGET_MODULES,
        lora_dropout=LORA_DROPOUT,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # NOTE(review): truncating to MAX_LENGTH can cut through image placeholder
    # tokens on long samples — confirm 4096 is large enough for this data.
    training_args = SFTConfig(
        output_dir=OUTPUT_DIR,
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=PER_DEVICE_BATCH,
        gradient_accumulation_steps=GRAD_ACCUM,
        learning_rate=LEARNING_RATE,
        warmup_ratio=WARMUP_RATIO,
        max_length=MAX_LENGTH,
        packing=USE_PACKING,
        packing_strategy="bfd",  # only consulted when packing is enabled
        bf16=True,
        gradient_checkpointing=True,
        logging_strategy="steps",
        logging_steps=10,
        logging_first_step=True,
        save_strategy="epoch",
        save_total_limit=2,
        push_to_hub=True,
        hub_model_id=HUB_MODEL_ID,
        report_to="trackio",
        run_name=TRACKIO_RUN_NAME,
        # The Trackio project is set through `project`; `project_name` is not
        # a TrainingArguments field and would raise a TypeError.
        project=TRACKIO_PROJECT,
        disable_tqdm=True,
    )

    train_dataset = _load_train_dataset()
    print(f"Vision dataset: {len(train_dataset)} examples")

    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        peft_config=peft_config,
        # `processing_class` replaces the removed `tokenizer` argument.
        processing_class=processor,
    )

    print("\nStarting Vision/UI SFT training")
    print(f" LoRA r={LORA_RANK}, alpha={LORA_ALPHA}, LR={LEARNING_RATE}, MaxLen={MAX_LENGTH}")

    trainer.train()
    trainer.save_model(os.path.join(OUTPUT_DIR, "final"))
    print(f"\nDone! Saved to {OUTPUT_DIR}/final")
    trackio.alert(
        title="Done: Vision SFT",
        text=f"Completed {NUM_EPOCHS} epochs. Model at {HUB_MODEL_ID}",
        level="INFO",
    )


if __name__ == "__main__":
    main()