| |
| """ |
| Alpaca Bengali Trainer |
| """ |
| from datasets import load_dataset |
| from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer |
|
|
def train_alpaca_model():
    """Fine-tune DialoGPT-medium on the Bengali Alpaca instruction dataset.

    Loads the ``train`` split of ``nihalbaig/alpaca_bangla``, renders every
    example as an instruction/answer prompt, tokenizes the prompts, trains a
    causal language model for three epochs and saves it to
    ``./bangla_alpaca_model``.

    Only the ``instruction`` and ``output`` columns of the dataset are used.

    Returns:
        None. Progress is reported on stdout and the trained model is
        written to the output directory.
    """
    print("💬 Training Bengali Instruction Following...")

    ds = load_dataset("nihalbaig/alpaca_bangla", split="train")

    tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
    model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")
    # Use EOS as the padding token so the tokenizer is able to pad.
    tokenizer.pad_token = tokenizer.eos_token

    def prepare_data(examples):
        """Tokenize a batch of examples and attach causal-LM labels."""
        texts = [
            f"আদেশ: {instruction}\nউত্তর: {output}\n\n"
            for instruction, output in zip(examples['instruction'], examples['output'])
        ]
        # Pad to a fixed length: padding=True would only pad to the longest
        # sequence of each map() batch, leaving rows of different lengths
        # that cannot be stacked into a tensor once training shuffles them.
        encoded = tokenizer(
            texts,
            truncation=True,
            padding="max_length",
            max_length=512,
        )
        # A causal LM only returns a loss when labels are supplied. Labels
        # mirror the input ids; padding positions are set to -100 so the loss
        # ignores them. The attention mask is used instead of comparing with
        # the pad id because the pad token is also the EOS token.
        encoded["labels"] = [
            [token if keep else -100 for token, keep in zip(ids, mask)]
            for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
        ]
        return encoded

    tokenized_ds = ds.map(prepare_data, batched=True)

    training_args = TrainingArguments(
        output_dir="./bangla_alpaca_model",
        num_train_epochs=3,
        per_device_train_batch_size=4,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_ds,
    )

    trainer.train()
    # With no explicit path, save_model() writes to training_args.output_dir.
    trainer.save_model()
    print("✅ Alpaca model trained!")
|
|
if __name__ == "__main__":
    # Start training only when this file is executed as a script, not when
    # it is imported as a module.
    train_alpaca_model()
|
|