Spaces:

build-small-hackathon
/

lesson-agent

Running on Zero

App Files Files Community

MSGEncrypted committed 24 days ago

Commit

9341111

1 Parent(s): 8ccf67b

fix

Browse files

Files changed (1) hide show

notebook/gemma-finetune.ipynb +98 -2

notebook/gemma-finetune.ipynb CHANGED Viewed

@@ -238,6 +238,103 @@
       "execution_count": null,
       "outputs": []
     },
     {
       "cell_type": "code",
       "metadata": {
@@ -364,8 +461,7 @@
         "# Option B — wrap manually, omit peft_config from SFTTrainer:\n",
         "#   tuned_model = get_peft_model(tuned_model, lora_config)\n",
         "#   trainer = SFTTrainer(model=tuned_model, ...)  # no peft_config\n",
-        "\n",
-        ""
       ],
       "execution_count": null,
       "outputs": []

       "execution_count": null,
       "outputs": []
     },
+    {
+      "cell_type": "code",
+      "metadata": {},
+      "source": [
+        "\n",
+        "# Prepare model for k-bit training\n",
+        "tuned_model = prepare_model_for_kbit_training(tuned_model)\n",
+        "\n",
+        "# --- 2. Configure LoRA ---\n",
+        "lora_config = LoraConfig(\n",
+        "    r=16, # LoRA attention dimension\n",
+        "    lora_alpha=16, # Alpha parameter for LoRA scaling\n",
+        "    target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"], # Target all linear layers\n",
+        "    lora_dropout=0.05, # Dropout probability for LoRA layers\n",
+        "    bias=\"none\", # Do not train any bias parameters (only the LoRA weights are updated)\n",
+        "    task_type=\"CAUSAL_LM\", # Task type for causal language modeling\n",
+        ")\n",
+        "\n",
+        "# Do NOT call get_peft_model() here — SFTTrainer wraps the model when peft_config is passed.\n",
+        "# tuned_model = get_peft_model(tuned_model, lora_config)\n",
+        "\n",
+        "# --- 3. Prepare a Sample Dataset ---\n",
+        "# For a real-world scenario, you would load your own dataset using `load_dataset`\n",
+        "# from the `datasets` library and format it appropriately.\n",
+        "# This is a simple dummy dataset for demonstration.\n",
+        "\n",
+        "# Example instruction tuning dataset format\n",
+        "data = {\n",
+        "    \"text\": [\n",
+        "        \"<start_of_turn>user\\nWhat is the capital of France?<end_of_turn>\\n<start_of_turn>model\\nParis is the capital of France.<end_of_turn>\",\n",
+        "        \"<start_of_turn>user\\nSuggest a healthy snack.\\n<end_of_turn>\\n<start_of_turn>model\\nAlmonds or a piece of fruit like an apple are great healthy snack options.<end_of_turn>\",\n",
+        "        \"<start_of_turn>user\\nExplain the concept of photosynthesis.\\n<end_of_turn>\\n<start_of_turn>model\\nPhotosynthesis is the process by which green plants and some other organisms convert light energy into chemical energy.<end_of_turn>\"\n",
+        "    ]\n",
+        "}\n",
+        "\n",
+        "dataset = Dataset.from_dict(data)\n",
+        "\n",
+        "# --- 4. Define Training Arguments ---\n",
+        "from transformers import TrainingArguments\n",
+        "\n",
+        "training_args = TrainingArguments(\n",
+        "    output_dir=\"./gemma_finetuned\", # Output directory for checkpoints and logs\n",
+        "    num_train_epochs=1, # Number of training epochs\n",
+        "    per_device_train_batch_size=2, # Batch size per GPU/CPU for training\n",
+        "    gradient_accumulation_steps=2, # Number of steps whose gradients are accumulated before each optimizer update\n",
+        "    optim=\"paged_adamw_8bit\", # Optimizer to use\n",
+        "    save_steps=100, # Save a checkpoint every X update steps\n",
+        "    logging_steps=10, # Log every X update steps\n",
+        "    learning_rate=2e-4, # Initial learning rate for AdamW optimizer\n",
+        "    weight_decay=0.001, # Weight decay for AdamW\n",
+        "    fp16=False, # Must match bnb_4bit_compute_dtype (bf16 below)\n",
+        "    bf16=True,  # Use bf16 when bnb_4bit_compute_dtype=torch.bfloat16\n",
+        "    max_grad_norm=0.3, # Max gradient norm\n",
+        "    max_steps=-1, # Don't limit training by steps, use epochs\n",
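+        "    warmup_ratio=0.03, # Fraction of total steps used for a linear warmup from 0 to learning_rate; NOTE(review): appears to have no effect with lr_scheduler_type=\"constant\" (\"constant_with_warmup\" would apply it) - confirm\n",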
+        "    # group_by_length=True, # Group sequences of roughly the same length together to speed up training\n",
+        "    lr_scheduler_type=\"constant\", # Learning rate scheduler type\n",
+        "    report_to=\"none\" # Disable reporting to any tracking service\n",
+        ")\n",
+        "\n",
+        "# --- 5. Initialize and Run SFTTrainer ---\n",
+        "\n",
+        "trainer = SFTTrainer(\n",
+        "    model=tuned_model,          # plain (non-PEFT) base model\n",
+        "    train_dataset=dataset,\n",
+        "    peft_config=lora_config,    # SFTTrainer applies LoRA internally\n",
+        "    # dataset_text_field=\"text\", # Name of the column containing the text data\n",
+        "    # tokenizer=tokenizer,\n",
+        "    args=training_args,\n",
+        "    # packing=False, # Whether to pack multiple short examples into one longer sequence to improve efficiency\n",
+        "    # max_seq_length=512, # Max sequence length to use for training\n",
+        ")\n",
+        "\n",
+        "print(\"Starting finetuning...\")\n",
+        "trainer.train()\n",
+        "print(\"Finetuning complete!\")\n",
+        "\n",
+        "# --- 6. Save the LoRA adapter ---\n",
+        "trainer.model.save_pretrained(\"./gemma_finetuned_model\")\n",
+        "tokenizer.save_pretrained(\"./gemma_finetuned_model\")\n",
+        "\n",
+        "# --- 7. (Optional) Merge LoRA adapters for inference ---\n",
+        "# Merge in-memory from the trained model (avoids AutoPeft reload + torchao version issues).\n",
+        "merged_model = trainer.model.merge_and_unload()\n",
+        "merged_model.save_pretrained(\"gemma_merged_model\", safe_serialization=True)\n",
+        "tokenizer.save_pretrained(\"gemma_merged_model\")\n",
+        "\n",
+        "# If you need to reload the adapter from disk later instead, upgrade torchao first:\n",
+        "# !pip install -U \"torchao>=0.16.0\"\n",
+        "# from peft import PeftModel\n",
+        "# base_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map=\"auto\")\n",
+        "# peft_model = PeftModel.from_pretrained(base_model, \"./gemma_finetuned_model\")\n",
+        ""
+      ],
+      "execution_count": null,
+      "outputs": []
+    },
     {
       "cell_type": "code",
       "metadata": {
         "# Option B — wrap manually, omit peft_config from SFTTrainer:\n",
         "#   tuned_model = get_peft_model(tuned_model, lora_config)\n",
         "#   trainer = SFTTrainer(model=tuned_model, ...)  # no peft_config\n",
+        "\n"
       ],
       "execution_count": null,
       "outputs": []