{
  "cells": [
    {
      "cell_type": "code",
      "metadata": {
        "colab_type": "code",
        "execution": {
          "iopub.execute_input": "2026-06-07T13:53:33.681800Z",
          "iopub.status.busy": "2026-06-07T13:53:33.681410Z",
          "iopub.status.idle": "2026-06-07T13:53:47.949186Z",
          "shell.execute_reply": "2026-06-07T13:53:47.948138Z",
          "shell.execute_reply.started": "2026-06-07T13:53:33.681762Z"
        },
        "trusted": true
      },
      "source": [
        "# Use the %pip magic so the package is installed into the running kernel's environment\n",
        "# (a `!pip` subshell may resolve to a different interpreter).\n",
        "%pip install -U transformers"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "colab_type": "text"
      },
      "source": [
        "## Local Inference on GPU\n",
        "Model page: https://huggingface.co/google/gemma-4-E2B-it-qat-mobile-transformers\n",
        "\n",
        "**Note:** the code cells below currently load `openbmb/MiniCPM5-1B` (see `model_name`), not the Gemma checkpoint linked above.\n",
        "\n",
        "⚠️ If the generated code snippets do not work, please open an issue on either the [model repo](https://huggingface.co/google/gemma-4-E2B-it-qat-mobile-transformers)\n",
        "and/or on [huggingface.js](https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/model-libraries-snippets.ts) 🙏"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "execution": {
          "iopub.execute_input": "2026-06-07T14:00:05.743785Z",
          "iopub.status.busy": "2026-06-07T14:00:05.742974Z",
          "iopub.status.idle": "2026-06-07T14:00:05.748408Z",
          "shell.execute_reply": "2026-06-07T14:00:05.747778Z",
          "shell.execute_reply.started": "2026-06-07T14:00:05.743756Z"
        },
        "trusted": true
      },
      "source": [
        "# Shared setup: imports, target device and the Hugging Face checkpoint ids\n",
        "import torch\n",
        "from transformers import AutoModel\n",
        "\n",
        "# Run on the GPU when torch can see one, otherwise stay on the CPU\n",
        "if torch.cuda.is_available():\n",
        "    device = \"cuda\"\n",
        "else:\n",
        "    device = \"cpu\"\n",
        "\n",
        "# Checkpoint ids; `model_name` is the one the cells below load\n",
        "model_openbmb = \"openbmb/MiniCPM5-1B\"\n",
        "model_name = model_openbmb\n",
        "model_name_gemma = \"google/gemma-4-E2B-it-qat-mobile-transformers\""
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab_type": "code",
        "execution": {
          "iopub.execute_input": "2026-06-07T13:58:21.771701Z",
          "iopub.status.busy": "2026-06-07T13:58:21.770950Z",
          "iopub.status.idle": "2026-06-07T13:58:42.868203Z",
          "shell.execute_reply": "2026-06-07T13:58:42.867564Z",
          "shell.execute_reply.started": "2026-06-07T13:58:21.771669Z"
        },
        "trusted": true
      },
      "source": [
        "# Load the checkpoint with dtype=\"auto\", then place it on the selected device.\n",
        "# NOTE(review): the next cell rebinds `model` to an AutoModelForCausalLM instance;\n",
        "# confirm this AutoModel load is still needed.\n",
        "model = AutoModel.from_pretrained(model_name, dtype=\"auto\")\n",
        "model = model.to(device)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "execution": {
          "iopub.execute_input": "2026-06-07T14:00:17.480009Z",
          "iopub.status.busy": "2026-06-07T14:00:17.479002Z",
          "iopub.status.idle": "2026-06-07T14:00:28.838734Z",
          "shell.execute_reply": "2026-06-07T14:00:28.837707Z",
          "shell.execute_reply.started": "2026-06-07T14:00:17.479974Z"
        },
        "trusted": true
      },
      "source": [
        "# Load the tokenizer and the causal-LM variant of the checkpoint used for generation\n",
        "from transformers import AutoTokenizer, AutoModelForCausalLM\n",
        "\n",
        "# Reuse the checkpoint id from the config cell instead of repeating the literal\n",
        "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
        "\n",
        "# Load with dtype=\"auto\" and move the model to `device`, so generation runs on the\n",
        "# GPU when one is available (the inputs below follow `model.device`).\n",
        "model = AutoModelForCausalLM.from_pretrained(model_name, dtype=\"auto\").to(device)\n"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "execution": {
          "iopub.execute_input": "2026-06-07T14:00:51.808940Z",
          "iopub.status.busy": "2026-06-07T14:00:51.808046Z",
          "iopub.status.idle": "2026-06-07T14:00:59.481653Z",
          "shell.execute_reply": "2026-06-07T14:00:59.480980Z",
          "shell.execute_reply.started": "2026-06-07T14:00:51.808892Z"
        },
        "trusted": true
      },
      "source": [
        "# Single-turn conversation to send to the model\n",
        "messages = [{\"role\": \"user\", \"content\": \"Who are you?\"}]\n",
        "\n",
        "# Render the chat template straight to tensors, then move them to the model's device\n",
        "inputs = tokenizer.apply_chat_template(\n",
        "    messages,\n",
        "    tokenize=True,\n",
        "    return_dict=True,\n",
        "    return_tensors=\"pt\",\n",
        "    add_generation_prompt=True,\n",
        ")\n",
        "inputs = inputs.to(model.device)\n",
        "\n",
        "outputs = model.generate(**inputs, max_new_tokens=120)\n",
        "\n",
        "# Decode only the tokens that follow the prompt\n",
        "prompt_length = inputs[\"input_ids\"].shape[-1]\n",
        "completion_ids = outputs[0][prompt_length:]\n",
        "print(tokenizer.decode(completion_ids))"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "execution": {
          "iopub.execute_input": "2026-06-07T13:59:02.169453Z",
          "iopub.status.busy": "2026-06-07T13:59:02.168479Z",
          "iopub.status.idle": "2026-06-07T13:59:04.394628Z",
          "shell.execute_reply": "2026-06-07T13:59:04.393301Z",
          "shell.execute_reply.started": "2026-06-07T13:59:02.169418Z"
        },
        "trusted": true
      },
      "source": [
        "# Disabled alternative: text generation through `pipeline` (kept for reference, not executed).\n",
        "# NOTE(review): if this is re-enabled, `pipeline = pipeline(...)` rebinds the imported\n",
        "# function name; assign the result to a different variable.\n",
        "\n",
        "# from transformers import AutoTokenizer, pipeline\n",
        "\n",
        "# tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
        "\n",
        "# pipeline = pipeline(\n",
        "#     \"text-generation\",\n",
        "#     model=model,\n",
        "#     tokenizer=tokenizer,\n",
        "#     model_kwargs={\"torch_dtype\": \"auto\"},\n",
        "# )\n",
        "\n",
        "# # Example of generating chat/text\n",
        "# messages = [\n",
        "#     {\"role\": \"user\", \"content\": \"What is your favorite color?\"},\n",
        "# ]\n",
        "\n",
        "# prompt = pipeline.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
        "# outputs = pipeline(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)\n",
        "# print(outputs[0][\"generated_text\"])"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# Finetuning"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "execution": {
          "iopub.execute_input": "2026-06-07T14:01:09.304174Z",
          "iopub.status.busy": "2026-06-07T14:01:09.303286Z",
          "iopub.status.idle": "2026-06-07T14:01:32.898893Z",
          "shell.execute_reply": "2026-06-07T14:01:32.897942Z",
          "shell.execute_reply.started": "2026-06-07T14:01:09.304140Z"
        },
        "trusted": true
      },
      "source": [
        "# Install the finetuning libraries into the running kernel's environment\n",
        "# (%pip targets the kernel's interpreter; a `!pip` subshell may not).\n",
        "%pip install -q -U accelerate peft bitsandbytes transformers trl datasets\n",
        "\n",
        "import torch\n",
        "from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig\n",
        "from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training\n",
        "from trl import SFTTrainer\n",
        "from datasets import Dataset\n",
        "\n",
        "# --- 1. Load Model and Tokenizer for Finetuning with QLoRA ---\n",
        "# The `model` bound by the inference cells is not quantized, so a separate 4-bit\n",
        "# AutoModelForCausalLM is loaded here as `tuned_model`.\n",
        "\n",
        "# model_id = \"google/gemma-4-E2B-it-qat-mobile-transformers\"\n",
        "model_id = model_openbmb\n",
        "\n",
        "# Configure 4-bit quantization\n",
        "bnb_config = BitsAndBytesConfig(\n",
        "    load_in_4bit=True,\n",
        "    bnb_4bit_quant_type=\"nf4\",\n",
        "    bnb_4bit_compute_dtype=torch.bfloat16,\n",
        "    bnb_4bit_use_double_quant=False,\n",
        ")\n",
        "\n",
        "# Load the model with 4-bit quantization\n",
        "tuned_model = AutoModelForCausalLM.from_pretrained(\n",
        "    model_id,\n",
        "    quantization_config=bnb_config,\n",
        "    device_map=\"auto\",\n",
        ")\n",
        "\n",
        "tuned_model.config.use_cache = False\n",
        "tuned_model.config.pretraining_tp = 1\n",
        "\n",
        "# NOTE(review): trust_remote_code=True lets the repository run its own Python code;\n",
        "# the inference cell loads this tokenizer without it — confirm it is required here.\n",
        "tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)\n",
        "\n",
        "# Fall back to EOS for padding only when the tokenizer has no pad token of its own;\n",
        "# overwriting an existing pad token would make padding indistinguishable from EOS.\n",
        "if tokenizer.pad_token is None:\n",
        "    tokenizer.pad_token = tokenizer.eos_token\n"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# Prepare the quantized base model for k-bit training\n",
        "tuned_model = prepare_model_for_kbit_training(tuned_model)\n",
        "\n",
        "# --- 2. Configure LoRA ---\n",
        "lora_config = LoraConfig(\n",
        "    r=16, # LoRA rank (dimension of the low-rank update matrices)\n",
        "    lora_alpha=16, # Alpha parameter for LoRA scaling\n",
        "    target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"], # Projection layers that receive LoRA adapters\n",
        "    lora_dropout=0.05, # Dropout probability for LoRA layers\n",
        "    bias=\"none\", # Do not train any bias parameters\n",
        "    task_type=\"CAUSAL_LM\", # Task type for causal language modeling\n",
        ")\n",
        "\n",
        "# Do NOT call get_peft_model() here — SFTTrainer wraps the model when peft_config is passed.\n",
        "# tuned_model = get_peft_model(tuned_model, lora_config)\n",
        "\n",
        "# --- 3. Prepare a Sample Dataset ---\n",
        "# For a real-world scenario, you would load your own dataset using `load_dataset`\n",
        "# from the `datasets` library and format it appropriately.\n",
        "# This is a simple dummy dataset for demonstration.\n",
        "\n",
        "# Conversations are rendered with the tokenizer's own chat template so the turn\n",
        "# markers match the checkpoint selected by `model_id`, instead of hard-coding\n",
        "# another model family's markup into the training text.\n",
        "conversations = [\n",
        "    [\n",
        "        {\"role\": \"user\", \"content\": \"What is the capital of France?\"},\n",
        "        {\"role\": \"assistant\", \"content\": \"Paris is the capital of France.\"},\n",
        "    ],\n",
        "    [\n",
        "        {\"role\": \"user\", \"content\": \"Suggest a healthy snack.\"},\n",
        "        {\"role\": \"assistant\", \"content\": \"Almonds or a piece of fruit like an apple are great healthy snack options.\"},\n",
        "    ],\n",
        "    [\n",
        "        {\"role\": \"user\", \"content\": \"Explain the concept of photosynthesis.\"},\n",
        "        {\"role\": \"assistant\", \"content\": \"Photosynthesis is the process by which green plants and some other organisms convert light energy into chemical energy.\"},\n",
        "    ],\n",
        "]\n",
        "\n",
        "# One formatted training string per conversation, stored in the \"text\" column\n",
        "data = {\n",
        "    \"text\": [\n",
        "        tokenizer.apply_chat_template(conversation, tokenize=False)\n",
        "        for conversation in conversations\n",
        "    ]\n",
        "}\n",
        "\n",
        "dataset = Dataset.from_dict(data)\n",
        "\n",
        "# --- 4. Define Training Arguments ---\n",
        "from transformers import TrainingArguments\n",
        "\n",
        "# Mixed-precision flags follow the dtype the 4-bit layers compute in\n",
        "# (bnb_4bit_compute_dtype), so the two settings cannot drift apart.\n",
        "use_bf16 = bnb_config.bnb_4bit_compute_dtype == torch.bfloat16\n",
        "use_fp16 = bnb_config.bnb_4bit_compute_dtype == torch.float16\n",
        "\n",
        "training_args = TrainingArguments(\n",
        "    output_dir=\"./gemma_finetuned\", # Output directory for checkpoints and logs\n",
        "    num_train_epochs=1, # Number of training epochs\n",
        "    per_device_train_batch_size=2, # Batch size per GPU/CPU for training\n",
        "    gradient_accumulation_steps=2, # Number of batches to accumulate gradients over before each optimizer step\n",
        "    optim=\"paged_adamw_8bit\", # Optimizer to use\n",
        "    save_steps=100, # Save checkpoint every X updates steps\n",
        "    logging_steps=10, # Log every X updates steps\n",
        "    learning_rate=2e-4, # Initial learning rate for AdamW optimizer\n",
        "    weight_decay=0.001, # Weight decay for AdamW\n",
        "    fp16=use_fp16, # Only when the quantized layers compute in float16\n",
        "    bf16=use_bf16, # Only when the quantized layers compute in bfloat16\n",
        "    max_grad_norm=0.3, # Max gradient norm\n",
        "    max_steps=-1, # Don't limit training by steps, use epochs\n",
        "    warmup_ratio=0.03, # Warmup fraction; ignored by the plain \"constant\" scheduler (\"constant_with_warmup\" would use it)\n",
        "    # group_by_length=True, # Group sequences of roughly the same length together to speed up training\n",
        "    lr_scheduler_type=\"constant\", # Learning rate scheduler type\n",
        "    report_to=\"none\" # Disable reporting to any tracking service\n",
        ")\n",
        "\n",
        "# --- 5. Initialize and Run SFTTrainer ---\n",
        "\n",
        "trainer = SFTTrainer(\n",
        "    model=tuned_model,          # plain (non-PEFT) base model\n",
        "    train_dataset=dataset,\n",
        "    peft_config=lora_config,    # SFTTrainer applies LoRA internally\n",
        "    # dataset_text_field=\"text\", # Name of the column containing the text data\n",
        "    # tokenizer=tokenizer,\n",
        "    args=training_args,\n",
        "    # packing=False, # Whether to pack multiple short examples into one longer sequence to improve efficiency\n",
        "    # max_seq_length=512, # Max sequence length to use for training\n",
        ")\n",
        "\n",
        "print(\"Starting finetuning...\")\n",
        "trainer.train()\n",
        "print(\"Finetuning complete!\")\n",
        "\n",
        "# --- 6. Save the LoRA adapter ---\n",
        "trainer.model.save_pretrained(\"./gemma_finetuned_model\")\n",
        "tokenizer.save_pretrained(\"./gemma_finetuned_model\")\n",
        "\n",
        "# --- 7. (Optional) Merge LoRA adapters for inference ---\n",
        "# Merge in-memory from the trained model (avoids AutoPeft reload + torchao version issues).\n",
        "merged_model = trainer.model.merge_and_unload()\n",
        "merged_model.save_pretrained(\"gemma_merged_model\", safe_serialization=True)\n",
        "tokenizer.save_pretrained(\"gemma_merged_model\")\n",
        "\n",
        "# If you need to reload the adapter from disk later instead, upgrade torchao first:\n",
        "# !pip install -U \"torchao>=0.16.0\"\n",
        "# from peft import PeftModel\n",
        "# base_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map=\"auto\")\n",
        "# peft_model = PeftModel.from_pretrained(base_model, \"./gemma_finetuned_model\")\n",
        ""
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "execution": {
          "iopub.execute_input": "2026-06-07T14:02:33.440153Z",
          "iopub.status.busy": "2026-06-07T14:02:33.439372Z",
          "iopub.status.idle": "2026-06-07T14:02:36.463743Z",
          "shell.execute_reply": "2026-06-07T14:02:36.462765Z",
          "shell.execute_reply.started": "2026-06-07T14:02:33.440118Z"
        },
        "trusted": true
      },
      "source": [
        "# NOTE(review): this cell repeats steps 2-5 of the finetuning cell above; running both\n",
        "# trains twice. Keep only one of them once the notebook is cleaned up.\n",
        "\n",
        "# Prepare the quantized base model for k-bit training\n",
        "tuned_model = prepare_model_for_kbit_training(tuned_model)\n",
        "\n",
        "# --- 2. Configure LoRA ---\n",
        "lora_config = LoraConfig(\n",
        "    r=16, # LoRA rank (dimension of the low-rank update matrices)\n",
        "    lora_alpha=16, # Alpha parameter for LoRA scaling\n",
        "    target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"], # Projection layers that receive LoRA adapters\n",
        "    lora_dropout=0.05, # Dropout probability for LoRA layers\n",
        "    bias=\"none\", # Do not train any bias parameters\n",
        "    task_type=\"CAUSAL_LM\", # Task type for causal language modeling\n",
        ")\n",
        "\n",
        "# Do NOT call get_peft_model() here — SFTTrainer wraps the model when peft_config is passed.\n",
        "# tuned_model = get_peft_model(tuned_model, lora_config)\n",
        "\n",
        "# --- 3. Prepare a Sample Dataset ---\n",
        "# For a real-world scenario, you would load your own dataset using `load_dataset`\n",
        "# from the `datasets` library and format it appropriately.\n",
        "# This is a simple dummy dataset for demonstration.\n",
        "\n",
        "# Conversations are rendered with the tokenizer's own chat template so the turn\n",
        "# markers match the checkpoint selected by `model_id`, instead of hard-coding\n",
        "# another model family's markup into the training text.\n",
        "conversations = [\n",
        "    [\n",
        "        {\"role\": \"user\", \"content\": \"What is the capital of France?\"},\n",
        "        {\"role\": \"assistant\", \"content\": \"Paris is the capital of France.\"},\n",
        "    ],\n",
        "    [\n",
        "        {\"role\": \"user\", \"content\": \"Suggest a healthy snack.\"},\n",
        "        {\"role\": \"assistant\", \"content\": \"Almonds or a piece of fruit like an apple are great healthy snack options.\"},\n",
        "    ],\n",
        "    [\n",
        "        {\"role\": \"user\", \"content\": \"Explain the concept of photosynthesis.\"},\n",
        "        {\"role\": \"assistant\", \"content\": \"Photosynthesis is the process by which green plants and some other organisms convert light energy into chemical energy.\"},\n",
        "    ],\n",
        "]\n",
        "\n",
        "# One formatted training string per conversation, stored in the \"text\" column\n",
        "data = {\n",
        "    \"text\": [\n",
        "        tokenizer.apply_chat_template(conversation, tokenize=False)\n",
        "        for conversation in conversations\n",
        "    ]\n",
        "}\n",
        "\n",
        "dataset = Dataset.from_dict(data)\n",
        "\n",
        "# --- 4. Define Training Arguments ---\n",
        "from transformers import TrainingArguments\n",
        "\n",
        "# Mixed-precision flags follow the dtype the 4-bit layers compute in\n",
        "# (bnb_4bit_compute_dtype); hard-coding fp16 here contradicted the bf16 compute dtype.\n",
        "use_bf16 = bnb_config.bnb_4bit_compute_dtype == torch.bfloat16\n",
        "use_fp16 = bnb_config.bnb_4bit_compute_dtype == torch.float16\n",
        "\n",
        "training_args = TrainingArguments(\n",
        "    output_dir=\"./gemma_finetuned\", # Output directory for checkpoints and logs\n",
        "    num_train_epochs=1, # Number of training epochs\n",
        "    per_device_train_batch_size=2, # Batch size per GPU/CPU for training\n",
        "    gradient_accumulation_steps=2, # Number of batches to accumulate gradients over before each optimizer step\n",
        "    optim=\"paged_adamw_8bit\", # Optimizer to use\n",
        "    save_steps=100, # Save checkpoint every X updates steps\n",
        "    logging_steps=10, # Log every X updates steps\n",
        "    learning_rate=2e-4, # Initial learning rate for AdamW optimizer\n",
        "    weight_decay=0.001, # Weight decay for AdamW\n",
        "    fp16=use_fp16, # Only when the quantized layers compute in float16\n",
        "    bf16=use_bf16, # Only when the quantized layers compute in bfloat16\n",
        "    max_grad_norm=0.3, # Max gradient norm\n",
        "    max_steps=-1, # Don't limit training by steps, use epochs\n",
        "    warmup_ratio=0.03, # Warmup fraction; ignored by the plain \"constant\" scheduler (\"constant_with_warmup\" would use it)\n",
        "    # group_by_length=True, # Group sequences of roughly the same length together to speed up training\n",
        "    lr_scheduler_type=\"constant\", # Learning rate scheduler type\n",
        "    report_to=\"none\" # Disable reporting to any tracking service\n",
        ")\n",
        "\n",
        "# --- 5. Initialize and Run SFTTrainer ---\n",
        "\n",
        "trainer = SFTTrainer(\n",
        "    model=tuned_model,          # plain (non-PEFT) base model\n",
        "    train_dataset=dataset,\n",
        "    peft_config=lora_config,    # SFTTrainer applies LoRA internally\n",
        "    # dataset_text_field=\"text\", # Name of the column containing the text data\n",
        "    # tokenizer=tokenizer,\n",
        "    args=training_args,\n",
        "    # packing=False, # Whether to pack multiple short examples into one longer sequence to improve efficiency\n",
        "    # max_seq_length=512, # Max sequence length to use for training\n",
        ")\n",
        "\n",
        "print(\"Starting finetuning...\")\n",
        "trainer.train()\n",
        "print(\"Finetuning complete!\")\n",
        "\n",
        "# --- 6. (Optional) Save the finetuned model ---\n",
        "# trainer.save_model(\"./gemma_finetuned_model\")\n",
        "\n",
        "# --- 7. (Optional) Merge LoRA adapters with the base model for inference ---\n",
        "# from peft import AutoPeftModelForCausalLM\n",
        "# merged_model = AutoPeftModelForCausalLM.from_pretrained(\n",
        "#     \"./gemma_finetuned_model\",\n",
        "#     device_map=\"auto\",\n",
        "#     torch_dtype=torch.bfloat16 # or torch.float16 depending on your hardware\n",
        "# )\n",
        "# merged_model.save_pretrained(\"gemma_merged_model\", safe_serialization=True)\n",
        "# tokenizer.save_pretrained(\"gemma_merged_model\")\n"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "trusted": true
      },
      "source": [
        "# Duplicate cell removed — run the finetuning cell above that saves the adapter to ./gemma_finetuned_model.\n"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# Alternative (pick ONE approach, not both):\n",
        "#\n",
        "# Option A — let SFTTrainer apply LoRA (used in the cell above):\n",
        "#   tuned_model = prepare_model_for_kbit_training(tuned_model)\n",
        "#   trainer = SFTTrainer(model=tuned_model, peft_config=lora_config, ...)\n",
        "#\n",
        "# Option B — wrap manually, omit peft_config from SFTTrainer:\n",
        "#   tuned_model = get_peft_model(tuned_model, lora_config)\n",
        "#   trainer = SFTTrainer(model=tuned_model, ...)  # no peft_config\n",
        "\n"
      ],
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "gpuType": "T4",
      "machine_shape": "hm"
    },
    "kaggle": {
      "accelerator": "gpu"
    },
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.12.13"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}