#!/usr/bin/env python3
"""
Alpaca Bengali Trainer

Fine-tunes a causal language model on a Bengali instruction-following
dataset using the Hugging Face ``datasets`` and ``transformers`` libraries.
"""
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

DATASET_NAME = "nihalbaig/alpaca_bangla"
MODEL_NAME = "microsoft/DialoGPT-medium"
OUTPUT_DIR = "./bangla_alpaca_model"
MAX_LENGTH = 512


def train_alpaca_model():
    """Fine-tune the base model on the Bengali Alpaca data and save it.

    Loads the ``train`` split of ``DATASET_NAME``, renders every
    instruction/output pair into a single prompt string, tokenizes it, and
    trains ``MODEL_NAME`` for three epochs. The trained model and its
    tokenizer are written to ``OUTPUT_DIR``.

    Only the ``instruction`` and ``output`` columns are used.
    NOTE(review): Alpaca-style datasets usually also carry an ``input``
    column; it is ignored here -- confirm that this is intended.
    """
    print("💬 Training Bengali Instruction Following...")

    # Load data
    ds = load_dataset(DATASET_NAME, split="train")

    # Initialize model
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
    # The tokenizer is given no pad token by default here, so reuse EOS;
    # the collator below needs one to pad batches.
    tokenizer.pad_token = tokenizer.eos_token

    def prepare_data(examples):
        """Render a batch of instruction/output pairs and tokenize them."""
        texts = [
            f"আদেশ: {instruction}\nউত্তর: {output}\n\n"
            for instruction, output in zip(
                examples['instruction'], examples['output']
            )
        ]
        # No padding here: padding is applied per training batch by the
        # collator, so the stored dataset is not inflated to the longest
        # sequence of each map batch.
        return tokenizer(texts, truncation=True, max_length=MAX_LENGTH)

    # Drop the raw text columns so only tokenizer outputs reach the collator.
    tokenized_ds = ds.map(
        prepare_data,
        batched=True,
        remove_columns=ds.column_names,
    )

    # With mlm=False the collator pads each batch and builds `labels` from
    # `input_ids` (padding positions are excluded from the loss). Without
    # labels the model returns no loss and Trainer cannot train.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    # Training
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=3,
        per_device_train_batch_size=4,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_ds,
        data_collator=data_collator,
    )

    trainer.train()
    trainer.save_model()
    # Save the tokenizer next to the model so the output directory can be
    # loaded on its own with `from_pretrained`.
    tokenizer.save_pretrained(OUTPUT_DIR)
    print("✅ Alpaca model trained!")


if __name__ == "__main__":
    train_alpaca_model()