{ "cells": [ { "cell_type": "code", "metadata": { "colab_type": "code", "execution": { "iopub.execute_input": "2026-06-07T13:53:33.681800Z", "iopub.status.busy": "2026-06-07T13:53:33.681410Z", "iopub.status.idle": "2026-06-07T13:53:47.949186Z", "shell.execute_reply": "2026-06-07T13:53:47.948138Z", "shell.execute_reply.started": "2026-06-07T13:53:33.681762Z" }, "trusted": true }, "source": [
  "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
  "%pip install -q -U transformers"
 ], "execution_count": null, "outputs": [] },
 { "cell_type": "markdown", "metadata": { "colab_type": "text" }, "source": [
  "## Local Inference on GPU\n",
  "\n",
  "Model page: https://huggingface.co/google/gemma-4-E2B-it-qat-mobile-transformers\n",
  "\n",
  "**Note:** the code cells below currently load `openbmb/MiniCPM5-1B` (`model_name`); the Gemma model id is kept in `model_name_gemma`.\n",
  "\n",
  "⚠️ If the generated code snippets do not work, please open an issue on either the [model repo](https://huggingface.co/google/gemma-4-E2B-it-qat-mobile-transformers)\n",
  "and/or on [huggingface.js](https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/model-libraries-snippets.ts) 🙏"
 ] },
 { "cell_type": "code", "metadata": { "execution": { "iopub.execute_input": "2026-06-07T14:00:05.743785Z", "iopub.status.busy": "2026-06-07T14:00:05.742974Z", "iopub.status.idle": "2026-06-07T14:00:05.748408Z", "shell.execute_reply": "2026-06-07T14:00:05.747778Z", "shell.execute_reply.started": "2026-06-07T14:00:05.743756Z" }, "trusted": true }, "source": [
  "# Shared imports and configuration used by the cells below.\n",
  "import torch\n",
  "from transformers import AutoModel\n",
  "\n",
  "# Run on the GPU when one is available, otherwise fall back to the CPU.\n",
  "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
  "\n",
  "# Hugging Face model ids.\n",
  "model_name_gemma = \"google/gemma-4-E2B-it-qat-mobile-transformers\"\n",
  "model_openbmb = \"openbmb/MiniCPM5-1B\"\n",
  "model_name = model_openbmb  # single source of truth for the default model id"
 ], "execution_count": null, "outputs": [] },
 { "cell_type": "code", "metadata": { "colab_type": "code", "execution": { "iopub.execute_input": "2026-06-07T13:58:21.771701Z", "iopub.status.busy": "2026-06-07T13:58:21.770950Z", 
"iopub.status.idle": "2026-06-07T13:58:42.868203Z", "shell.execute_reply": "2026-06-07T13:58:42.867564Z", "shell.execute_reply.started": "2026-06-07T13:58:21.771669Z" }, "trusted": true }, "source": [
  "# Load the tokenizer and the model once, directly on the selected device.\n",
  "# AutoModelForCausalLM (not the head-less AutoModel) is what generate() needs.\n",
  "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
  "\n",
  "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
  "model = AutoModelForCausalLM.from_pretrained(model_name, dtype=\"auto\").to(device)"
 ], "execution_count": null, "outputs": [] },
 { "cell_type": "code", "metadata": { "execution": { "iopub.execute_input": "2026-06-07T14:00:51.808940Z", "iopub.status.busy": "2026-06-07T14:00:51.808046Z", "iopub.status.idle": "2026-06-07T14:00:59.481653Z", "shell.execute_reply": "2026-06-07T14:00:59.480980Z", "shell.execute_reply.started": "2026-06-07T14:00:51.808892Z" }, "trusted": true }, "source": [
  "# Build a single-turn chat prompt with the tokenizer's chat template.\n",
  "messages = [\n",
  "    {\"role\": \"user\", \"content\": \"Who are you?\"},\n",
  "]\n",
  "inputs = tokenizer.apply_chat_template(\n",
  "    messages,\n",
  "    add_generation_prompt=True,\n",
  "    tokenize=True,\n",
  "    return_dict=True,\n",
  "    return_tensors=\"pt\",\n",
  ").to(model.device)  # keep the inputs on the same device as the model\n",
  "\n",
  "outputs = model.generate(**inputs, max_new_tokens=120)\n",
  "\n",
  "# Decode only the newly generated tokens: drop the echoed prompt and special tokens.\n",
  "prompt_length = inputs[\"input_ids\"].shape[-1]\n",
  "print(tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True))"
 ], "execution_count": null, "outputs": [] },
 { "cell_type": "code", "metadata": { "execution": { "iopub.execute_input": "2026-06-07T13:59:02.169453Z", "iopub.status.busy": "2026-06-07T13:59:02.168479Z", "iopub.status.idle": "2026-06-07T13:59:04.394628Z", 
"shell.execute_reply": "2026-06-07T13:59:04.393301Z", "shell.execute_reply.started": "2026-06-07T13:59:02.169418Z" }, "trusted": true }, "source": [
  "# from transformers import AutoTokenizer, pipeline\n",
  "\n",
  "# tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
  "\n",
  "# pipeline = pipeline(\n",
  "# \"text-generation\",\n",
  "# model=model,\n",
  "# tokenizer=tokenizer,\n",
  "# model_kwargs={\"torch_dtype\": \"auto\"},\n",
  "# )\n",
  "\n",
  "# # Example of generating chat/text\n",
  "# messages = [\n",
  "# {\"role\": \"user\", \"content\": \"What is your favorite color?\"},\n",
  "# ]\n",
  "\n",
  "# prompt = pipeline.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
  "# outputs = pipeline(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)\n",
  "# print(outputs[0][\"generated_text\"])"
 ], "execution_count": null, "outputs": [] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "# Finetuning"
 ] },
 { "cell_type": "code", "metadata": { "execution": { "iopub.execute_input": "2026-06-07T14:01:09.304174Z", "iopub.status.busy": "2026-06-07T14:01:09.303286Z", "iopub.status.idle": "2026-06-07T14:01:32.898893Z", "shell.execute_reply": "2026-06-07T14:01:32.897942Z", "shell.execute_reply.started": "2026-06-07T14:01:09.304140Z" }, "trusted": true }, "source": [
  "# Install the libraries needed for QLoRA finetuning (if not already installed).\n",
  "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
  "%pip install -q -U accelerate peft bitsandbytes transformers trl datasets\n",
  "\n",
  "import torch\n",
  "from datasets import Dataset\n",
  "from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training\n",
  "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\n",
  "from trl import SFTTrainer\n",
  "\n",
  "# --- 1. Load Model and Tokenizer for Finetuning with QLoRA ---\n",
  "# The `model` loaded by the inference cells is not quantized, so a separate\n",
  "# 4-bit copy (`tuned_model`) is loaded here for QLoRA training.\n",
  "\n",
  "model_id = model_openbmb  # or: model_name_gemma\n",
  "\n",
  "# Configure 4-bit quantization\n",
  "bnb_config = BitsAndBytesConfig(\n",
  "    load_in_4bit=True,\n",
  "    bnb_4bit_quant_type=\"nf4\",\n",
  "    bnb_4bit_compute_dtype=torch.bfloat16,\n",
  "    bnb_4bit_use_double_quant=False,\n",
  ")\n",
  "\n",
  "# Load the model with 4-bit quantization\n",
  "tuned_model = AutoModelForCausalLM.from_pretrained(\n",
  "    model_id,\n",
  "    quantization_config=bnb_config,\n",
  "    device_map=\"auto\",\n",
  ")\n",
  "\n",
  "# The KV cache is only useful for generation; disable it for training.\n",
  "tuned_model.config.use_cache = False\n",
  "# NOTE(review): pretraining_tp looks like a Llama-style config field - confirm this model reads it.\n",
  "tuned_model.config.pretraining_tp = 1\n",
  "\n",
  "tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)\n",
  "# Only fall back to EOS for padding when the tokenizer defines no pad token of its own;\n",
  "# overwriting an existing pad token would change how padded batches are built.\n",
  "if tokenizer.pad_token is None:\n",
  "    tokenizer.pad_token = tokenizer.eos_token\n"
 ], "execution_count": null, "outputs": [] },
 { "cell_type": "code", "metadata": {}, "source": [ "\n", "# Prepare model for k-bit training\n", "tuned_model = prepare_model_for_kbit_training(tuned_model)\n", "\n", "# --- 2. 
Configure LoRA ---\n",
  "lora_config = LoraConfig(\n",
  "    r=16,  # Rank of the LoRA update matrices\n",
  "    lora_alpha=16,  # Alpha parameter for LoRA scaling\n",
  "    # Attention and MLP projections; assumes the model uses these module names - TODO confirm.\n",
  "    target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
  "    lora_dropout=0.05,  # Dropout probability for LoRA layers\n",
  "    bias=\"none\",  # Do not train any bias parameters\n",
  "    task_type=\"CAUSAL_LM\",  # Task type for causal language modeling\n",
  ")\n",
  "\n",
  "# Do NOT call get_peft_model() here — SFTTrainer wraps the model when peft_config is passed.\n",
  "\n",
  "# --- 3. Prepare a Sample Dataset ---\n",
  "# For a real-world scenario, load your own dataset with `datasets.load_dataset` and\n",
  "# convert it to the same conversational (\"messages\") format.\n",
  "# Storing the examples as chat messages lets SFTTrainer render them with the tokenizer's\n",
  "# own chat template (the one the inference cell uses) instead of hand-written\n",
  "# `user`/`model` role markers, which may not match the template the model expects.\n",
  "qa_pairs = [\n",
  "    (\n",
  "        \"What is the capital of France?\",\n",
  "        \"Paris is the capital of France.\",\n",
  "    ),\n",
  "    (\n",
  "        \"Suggest a healthy snack.\",\n",
  "        \"Almonds or a piece of fruit like an apple are great healthy snack options.\",\n",
  "    ),\n",
  "    (\n",
  "        \"Explain the concept of photosynthesis.\",\n",
  "        \"Photosynthesis is the process by which green plants and some other organisms convert light energy into chemical energy.\",\n",
  "    ),\n",
  "]\n",
  "data = {\n",
  "    \"messages\": [\n",
  "        [\n",
  "            {\"role\": \"user\", \"content\": question},\n",
  "            {\"role\": \"assistant\", \"content\": answer},\n",
  "        ]\n",
  "        for question, answer in qa_pairs\n",
  "    ]\n",
  "}\n",
  "\n",
  "dataset = Dataset.from_dict(data)\n",
  "\n",
  "# --- 4. Define Training Arguments ---\n",
  "from transformers import TrainingArguments\n",
  "\n",
  "training_args = TrainingArguments(\n",
  "    output_dir=\"./gemma_finetuned\",  # Output directory for checkpoints and logs\n",
  "    num_train_epochs=1,  # Number of training epochs\n",
  "    per_device_train_batch_size=2,  # Batch size per GPU/CPU for training\n",
  "    gradient_accumulation_steps=2,  # Micro-batches accumulated before each optimizer step\n",
  "    optim=\"paged_adamw_8bit\",  # Optimizer to use\n",
  "    save_steps=100,  # Save a checkpoint every X update steps\n",
  "    logging_steps=10,  # Log every X update steps\n",
  "    learning_rate=2e-4,  # Initial learning rate for the optimizer\n",
  "    weight_decay=0.001,  # Weight decay\n",
  "    fp16=False,  # Must match bnb_4bit_compute_dtype (bf16 below)\n",
  "    bf16=True,  # Use bf16 when bnb_4bit_compute_dtype=torch.bfloat16\n",
  "    max_grad_norm=0.3,  # Max gradient norm\n",
  "    max_steps=-1,  # Don't limit training by steps, use epochs\n",
  "    warmup_ratio=0.03,  # NOTE: has no effect with the \"constant\" scheduler below, which does no warmup\n",
  "    lr_scheduler_type=\"constant\",  # Keep the learning rate constant\n",
  "    report_to=\"none\",  # Disable reporting to any tracking service\n",
  ")\n",
  "\n",
  "# --- 5. Initialize and Run SFTTrainer ---\n",
  "\n",
  "trainer = SFTTrainer(\n",
  "    model=tuned_model,  # quantized base model, not yet wrapped by PEFT\n",
  "    args=training_args,\n",
  "    train_dataset=dataset,\n",
  "    processing_class=tokenizer,  # train with the tokenizer loaded above rather than an implicit reload\n",
  "    peft_config=lora_config,  # SFTTrainer applies LoRA internally\n",
  ")\n",
  "\n",
  "print(\"Starting finetuning...\")\n",
  "trainer.train()\n",
  "print(\"Finetuning complete!\")\n",
  "\n",
  "# --- 6. Save the LoRA adapter ---\n",
  "trainer.model.save_pretrained(\"./gemma_finetuned_model\")\n",
  "tokenizer.save_pretrained(\"./gemma_finetuned_model\")\n",
  "\n",
  "# --- 7. (Optional) Merge LoRA adapters for inference ---\n",
  "# Merge in-memory from the trained model (avoids AutoPeft reload + torchao version issues).\n",
  "# NOTE: the base model is 4-bit quantized, so merging can introduce small rounding\n",
  "# differences; merge into a higher-precision copy of the base model if that matters.\n",
  "merged_model = trainer.model.merge_and_unload()\n",
  "merged_model.save_pretrained(\"gemma_merged_model\", safe_serialization=True)\n",
  "tokenizer.save_pretrained(\"gemma_merged_model\")\n",
  "\n",
  "# If you need to reload the adapter from disk later instead, upgrade torchao first:\n",
  "# !pip install -U \"torchao>=0.16.0\"\n",
  "# from peft import PeftModel\n",
  "# base_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map=\"auto\")\n",
  "# peft_model = PeftModel.from_pretrained(base_model, \"./gemma_finetuned_model\")\n"
 ], "execution_count": null, "outputs": [] },
 { "cell_type": "code", "metadata": { "execution": { "iopub.execute_input": "2026-06-07T14:02:33.440153Z", "iopub.status.busy": "2026-06-07T14:02:33.439372Z", "iopub.status.idle": "2026-06-07T14:02:36.463743Z", "shell.execute_reply": "2026-06-07T14:02:36.462765Z", "shell.execute_reply.started": "2026-06-07T14:02:33.440118Z" }, "trusted": true }, "source": [ "\n", "# Prepare model for k-bit 
training, LoRA setup and the SFTTrainer run all live in the finetuning cell above.\n",
  "# This cell used to hold a stale copy of that cell: it set fp16=True / bf16=False, which\n",
  "# contradicts the bfloat16 compute dtype of the 4-bit config, and it called\n",
  "# prepare_model_for_kbit_training() on `tuned_model` a second time.\n",
  "# Run the finetuning cell above instead.\n"
 ], "execution_count": null, "outputs": [] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "### Applying LoRA: pick ONE approach, not both\n",
  "\n",
  "**Option A** — let `SFTTrainer` apply LoRA (used in the finetuning cell above):\n",
  "\n",
  "```python\n",
  "tuned_model = prepare_model_for_kbit_training(tuned_model)\n",
  "trainer = SFTTrainer(model=tuned_model, peft_config=lora_config, ...)\n",
  "```\n",
  "\n",
  "**Option B** — wrap manually and omit `peft_config` from `SFTTrainer`:\n",
  "\n",
  "```python\n",
  "tuned_model = get_peft_model(tuned_model, lora_config)\n",
  "trainer = SFTTrainer(model=tuned_model, ...)  # no peft_config\n",
  "```"
 ] }
 ], "metadata": { "accelerator": "GPU", "colab": { "gpuType": "T4", "machine_shape": "hm" }, "kaggle": { "accelerator": "gpu" }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.13" } }, "nbformat": 4, "nbformat_minor": 4 }