zainabfatima097 committed on
Commit
b6d0c9b
·
verified ·
1 Parent(s): ec79bf7

Upload fine_tune.py

Browse files
Files changed (1) hide show
  1. fine_tune.py +91 -0
fine_tune.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Imports, grouped standard library first, then third-party.
# (`import torch` and the cache-clearing call were each listed twice; one of
# each is kept.)
import os

import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer

# Release any cached GPU memory before training starts.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Load the dataset by its Hub identifier.
dataset = load_dataset("zainabfatima097/My_Dataset")  # Change to your dataset path

# Show which splits were loaded.
print(f"Available dataset splits: {dataset.keys()}")

# When there is no 'train' split, reuse the 'validation' split for training.
# The 'validation' entry is kept, so both keys then refer to the same data.
if "train" not in dataset:
    if "validation" not in dataset:
        raise KeyError(
            "Dataset has neither a 'train' nor a 'validation' split; "
            f"available splits: {list(dataset.keys())}"
        )
    dataset["train"] = dataset["validation"]

# Language codes used as keys inside each "translation" record.
source_lang = "en"
target_lang = "hi"


def preprocess_function(examples):
    """Split a batch of translation records into source and target texts.

    Args:
        examples: A batch mapping whose "translation" entry is a list of
            dicts, each holding one string per language code.

    Returns:
        A dict with "input_text" (the ``source_lang`` strings) and
        "target_text" (the ``target_lang`` strings), in batch order.

    Raises:
        KeyError: If "translation" is missing from the batch, or a record
            lacks one of the two language codes.
    """
    pairs = examples["translation"]
    inputs = [pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]
    return {"input_text": inputs, "target_text": targets}

# Add the "input_text" / "target_text" columns to every split.
dataset = dataset.map(preprocess_function, batched=True)

# Load the tokenizer that belongs to the pretrained translation checkpoint.
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"  # Use your model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def tokenize_function(examples):
    """Tokenize a batch of source/target texts for seq2seq training.

    Args:
        examples: A batch mapping with "input_text" and "target_text" lists
            of strings.

    Returns:
        The tokenizer output for the source texts, with an added "labels"
        entry holding the target token ids. Both sides are truncated and
        padded to 128 tokens.
    """
    model_inputs = tokenizer(
        examples["input_text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    # Targets go through `text_target` so the tokenizer encodes them in its
    # target-language mode instead of treating them as source text.
    target_encodings = tokenizer(
        text_target=examples["target_text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    # Replace padding ids with -100 so padded positions are ignored by the
    # loss instead of being trained on as real target tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in label_ids]
        for label_ids in target_encodings["input_ids"]
    ]
    return model_inputs

# Tokenize every split and drop the raw text columns, which are no longer
# needed once the token ids exist.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["translation", "input_text", "target_text"],
)

# Pick the training and evaluation splits.
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets.get("validation", train_dataset)  # Use train if validation is missing

# Load the pretrained seq2seq model to fine-tune.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Training configuration, sized to keep GPU memory use low.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=2,  # Reduce batch size to prevent OOM
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # Accumulate gradients to simulate larger batch
    # Mixed precision is only requested when a CUDA device is present, so the
    # script can also start on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    optim="adamw_torch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

# Bundle model, data and configuration into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# Train; if the GPU runs out of memory, retry once on the CPU.
try:
    trainer.train()
except torch.cuda.OutOfMemoryError:
    print("⚠️ CUDA Out of Memory! Switching to CPU...")
    # Kept from the original script; the actual switch to CPU is done through
    # the CPU-specific training arguments below.
    os.environ["CUDA_VISIBLE_DEVICES"] = ""
    model.to("cpu")
    torch.cuda.empty_cache()

    # Retrying with the first Trainer would reuse arguments that still
    # request a CUDA device and fp16, so build a fresh one configured for CPU.
    # NOTE(review): `use_cpu` needs a transformers release that has this
    # argument; confirm against the installed version.
    cpu_training_args = TrainingArguments(
        output_dir="./results",
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=4,
        fp16=False,
        use_cpu=True,
        optim="adamw_torch",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
    )
    trainer = Trainer(
        model=model,
        args=cpu_training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
    )
    trainer.train()

# Save the fine-tuned model.
trainer.save_model("./final_model")
print("🎉 Training complete! Model saved.")