#!/usr/bin/env python3
"""
Production Bengali Math AI Training Script
For actual model training and deployment
"""

import inspect

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
import torch

# Hub identifiers and output location.
DATASET_NAME = "hamim-87/Ashrafur_bangla_math"
MODEL_NAME = "microsoft/DialoGPT-medium"  # or other compatible model
OUTPUT_DIR = "./bangla_math_ai_model"

# Size limits.
MAX_TRAIN_EXAMPLES = 50000  # upper bound on rows taken from the dataset
MAX_EVAL_EXAMPLES = 1000    # upper bound on the held-out evaluation split
MAX_SEQ_LENGTH = 512        # tokens kept per example after truncation
MAX_NEW_TOKENS = 200        # tokens generated in the post-training smoke test
SPLIT_SEED = 42             # fixed so the train/eval split is reproducible


def _format_prompt(problem):
    """Return the prompt for *problem*: the question followed by an empty answer slot."""
    return f"প্রশ্ন: {problem}\n\nউত্তর:"


def _format_examples(examples):
    """Build one training text per (problem, solution) pair of a batch.

    Args:
        examples: batch mapping with parallel "problem" and "solution" lists.

    Returns:
        A dict with a single "text" list, one entry per input pair.
    """
    texts = [
        # Reuse the inference prompt so training and generation share one format.
        f"{_format_prompt(problem)} {solution}\n\n"
        for problem, solution in zip(examples["problem"], examples["solution"])
    ]
    return {"text": texts}


def _build_training_args():
    """Create the TrainingArguments used for the fine-tuning run."""
    # Newer transformers releases call this argument `eval_strategy`; older
    # ones only know `evaluation_strategy`. Use whichever this install accepts.
    accepted = inspect.signature(TrainingArguments.__init__).parameters
    strategy_key = (
        "eval_strategy" if "eval_strategy" in accepted else "evaluation_strategy"
    )
    return TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=1000,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=100,
        eval_steps=1000,
        # Kept a multiple of eval_steps so load_best_model_at_end can pair
        # every saved checkpoint with an evaluation result.
        save_steps=2000,
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        greater_is_better=False,
        fp16=torch.cuda.is_available(),
        **{strategy_key: "steps"},
    )


def main():
    """Fine-tune the causal LM on the Bengali math dataset, save it, and run a sample generation.

    Raises:
        ValueError: if the dataset has fewer than two rows, which is too few
            to form separate training and evaluation splits.
    """
    print("🇧🇩 PRODUCTION BANGLI MATH AI TRAINING")
    print("=" * 40)

    # Load dataset
    print("📥 Loading full dataset...")
    ds = load_dataset(DATASET_NAME, split="train")

    # Use larger sample for training
    train_size = min(MAX_TRAIN_EXAMPLES, len(ds))  # Use up to 50k examples
    if train_size < 2:
        raise ValueError(
            f"Dataset has {len(ds)} example(s); at least 2 are needed "
            "to build separate train and eval splits."
        )
    ds = ds.select(range(train_size))

    print(f"✅ Using {len(ds)} examples for training")

    # Initialize model
    print("🤖 Initializing model...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

    # The tokenizer ships without a pad token; reuse EOS so batches can be padded.
    tokenizer.pad_token = tokenizer.eos_token

    # Prepare data
    print("🔧 Preparing training data...")
    dataset = ds.map(_format_examples, batched=True)

    def tokenize_function(examples):
        # No padding here: the data collator pads each training batch to its
        # own longest sequence, which avoids storing and processing pad tokens.
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=MAX_SEQ_LENGTH,
        )

    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        # Keep only the tokenizer outputs; the raw text columns are not model inputs.
        remove_columns=dataset.column_names,
    )

    # Hold out evaluation rows so the eval loss (used to pick the best
    # checkpoint) is measured on data the model was not trained on.
    eval_size = min(MAX_EVAL_EXAMPLES, max(1, len(tokenized_dataset) // 10))
    splits = tokenized_dataset.train_test_split(test_size=eval_size, seed=SPLIT_SEED)

    # Data collator (causal LM objective, so no masked-LM corruption)
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=_build_training_args(),
        train_dataset=splits["train"],
        eval_dataset=splits["test"],
        data_collator=data_collator,
    )

    # Train
    print("🎓 Starting training...")
    trainer.train()

    # Save model
    trainer.save_model()
    tokenizer.save_pretrained(OUTPUT_DIR)

    print("✅ Training completed and model saved!")

    # Test generation
    print("🧪 Testing model...")
    test_problem = "5 জন ছাত্র 3টি খেলায় অংশগ্রহণ করতে চায়..."
    input_text = _format_prompt(test_problem)

    model.eval()
    # The Trainer may have moved the model to an accelerator; the inputs must
    # live on the same device or generate() fails.
    encoded = tokenizer(input_text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            **encoded,
            # Budget counts generated tokens only, so a long prompt cannot
            # consume the whole allowance.
            max_new_tokens=MAX_NEW_TOKENS,
            num_return_sequences=1,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Generated: {response}")


if __name__ == "__main__":
    main()