# -*- coding: utf-8 -*-
"""hybird_constrinc_samer_n_lex_reg (82.9).ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/17rYEHl4vHDcV_mt-GgizkBfTPgAJn5l1

# Cell 2: Connect to Google Drive
"""

# =====================================================================================
# 1. IMPORTS & CONFIGURATION
# =====================================================================================
import ast  # To safely evaluate string-formatted lists
import gc
import json
import os
import zipfile

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from google.colab import drive
# main_predict() loads 'model.safetensors' with load_file, so it must be in
# scope for this first script as well (it was only imported by later cells).
from safetensors.torch import load_file
from sklearn.metrics import cohen_kappa_score
from torch.utils.data import Dataset as TorchDataset
from transformers import (
    AutoModel,
    AutoTokenizer,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

# Authorize Colab to access your Google Drive.
# This must run before any of the Drive paths below are created or read.
drive.mount('/content/drive')

# # Define All Paths on Google Drive  (notebook markdown cell)

# --- Model & Training ---
MODEL_NAME = "CAMeL-Lab/readability-arabertv2-d3tok-reg"
NUM_LABELS = 1        # single regression output
TARGET_CLASSES = 19   # predictions are rounded and clipped to 0..TARGET_CLASSES-1
NUM_FEATURES = 7      # length of the extra numerical feature vector per sample

# --- IMPORTANT: Set the path to your project folder on Google Drive ---
PROJECT_DRIVE_PATH = '/content/drive/MyDrive/BAREC_Competition'

# --- File & Directory Paths (Now relative to your Google Drive) ---
BASE_DIR = PROJECT_DRIVE_PATH
PROCESSED_DATA_DIR = os.path.join(BASE_DIR, "lex")
CHECKPOINT_DIR = os.path.join(BASE_DIR, "results", f"hybrid_regression_{MODEL_NAME.split('/')[-1]}")
SUBMISSION_DIR = os.path.join(BASE_DIR, "submission")

# Ensure the output directories exist on your Google Drive
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
os.makedirs(SUBMISSION_DIR, exist_ok=True)

# Paths to the preprocessed input files on Google Drive
TRAIN_PROCESSED_PATH = os.path.join(PROCESSED_DATA_DIR, 'train_processed_full.csv')
DEV_PROCESSED_PATH = os.path.join(PROCESSED_DATA_DIR, 'dev_processed_full.csv')
TEST_PROCESSED_PATH = os.path.join(PROCESSED_DATA_DIR, 'test_processed_full.csv')
# --- Submission Paths on Google Drive ---
SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_hybrid_regression.csv")
ZIPPED_SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_hybrid_regression.zip")

print(f"✔️ All paths configured to use Google Drive folder: {BASE_DIR}")

# # Data Loading and Helper Functions  (notebook markdown cell)

# =====================================================================================
# 2. DATA LOADING FUNCTION
# =====================================================================================
def _prepare_labelled_frame(df):
    """Parse the 'features' column and shift labels to a zero-based float scale.

    Mutates *df* in place and returns it. 'features' holds string-formatted
    lists, parsed with ``ast.literal_eval``. 'label' is cast to int, reduced by
    one and stored as float, as the regression target.
    """
    df['features'] = df['features'].apply(ast.literal_eval)
    df['label'] = (df['label'].astype(int) - 1).astype(float)
    return df


def load_preprocessed_data():
    """Load the pre-processed train and dev CSV files from Google Drive.

    Returns:
        ``(train_df, val_df)`` on success, or ``(None, None)`` when a file is
        missing or any other error occurs while loading (the error is printed).
    """
    print("\n--- Loading Preprocessed Data from Google Drive ---")
    try:
        train_df = pd.read_csv(TRAIN_PROCESSED_PATH)
        val_df = pd.read_csv(DEV_PROCESSED_PATH)

        print("Converting 'features' column from string to list...")
        # This can be slow; using apply is fine here.
        _prepare_labelled_frame(train_df)
        _prepare_labelled_frame(val_df)

        print(f"✔ Successfully loaded {len(train_df)} training and {len(val_df)} validation records.")
        return train_df, val_df
    except FileNotFoundError as e:
        print(f"❌ ERROR: Preprocessed file not found: {e}.")
        print("Please make sure your data is uploaded to the correct Google Drive folder.")
        return None, None
    except Exception as e:
        # Notebook-level boundary: report and let the caller abort on (None, None).
        print(f"❌ ERROR during data loading: {e}")
        return None, None


# =====================================================================================
# 3. DATASET, MODEL & METRICS DEFINITIONS
# =====================================================================================
class ReadabilityDataset(TorchDataset):
    """Custom PyTorch Dataset for handling texts, numerical features, and labels.

    Args:
        texts: Sequence of input texts (each is passed through ``str``).
        features: Sequence of numerical feature vectors, one per text.
        labels: Optional sequence of float targets, one per text.
        tokenizer_obj: Tokenizer used to encode each text (required).
        max_len: Every sample is padded/truncated to this many tokens.

    Raises:
        ValueError: If no tokenizer is given or the sequences differ in length.
    """

    def __init__(self, texts, features, labels=None, tokenizer_obj=None, max_len=256):
        if tokenizer_obj is None:
            raise ValueError("tokenizer_obj is required to encode the texts.")
        if len(features) != len(texts):
            raise ValueError(
                f"texts and features differ in length: {len(texts)} vs {len(features)}"
            )
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f"texts and labels differ in length: {len(texts)} vs {len(labels)}"
            )
        self.texts = texts
        self.features = features
        self.labels = labels
        self.tokenizer = tokenizer_obj
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask', 'features' and, if available, 'labels'."""
        text = str(self.texts[idx])
        feature_vec = torch.tensor(self.features[idx], dtype=torch.float)

        # Tokenize the text. token_type_ids are not requested because the
        # model's forward() never consumes them.
        encoded = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )

        # Create dictionary of tensors
        item = {
            'input_ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'features': feature_vec,
        }

        # Add labels if they exist (for training and validation)
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item


class HybridRegressionModel(nn.Module):
    """Pre-trained transformer plus a regression head that also sees numerical features.

    The hidden state at sequence position 0 is concatenated with the extra
    feature vector and passed through a two-layer MLP producing NUM_LABELS outputs.
    """

    def __init__(self, model_name, num_extra_features):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        # NOTE(review): presumably read by the HF Trainer when saving; kept as in the original.
        self._keys_to_ignore_on_save = []
        # The layer must be named 'head' to match the saved checkpoint file.
        self.head = nn.Sequential(
            nn.Linear(self.transformer.config.hidden_size + num_extra_features, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, NUM_LABELS),
        )

    def forward(self, input_ids, attention_mask, features, labels=None):
        """Return ``(loss, logits)`` when *labels* is given, otherwise just ``logits``."""
        transformer_outputs = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        cls_token_output = transformer_outputs.last_hidden_state[:, 0, :]
        combined_features = torch.cat([cls_token_output, features], dim=1)
        logits = self.head(combined_features)

        if labels is None:
            return logits
        loss = nn.MSELoss()(logits.view(-1), labels.view(-1))
        return (loss, logits)


def compute_metrics(p):
    """Compute Quadratic Weighted Kappa (QWK) and MSE for regression predictions.

    Args:
        p: Object exposing ``predictions`` and ``label_ids`` arrays.

    Returns:
        Dict with keys "qwk" and "mse".
    """
    # Flatten both sides so that a trailing singleton axis can never broadcast
    # the squared error to an N x N matrix.
    preds = np.asarray(p.predictions).reshape(-1)
    labels = np.asarray(p.label_ids).reshape(-1)

    # Round predictions to the nearest integer and clip to the valid label range.
    rounded_preds = np.clip(np.round(preds), 0, TARGET_CLASSES - 1).astype(int)
    # Labels are stored as floats; kappa is computed on integer class ids.
    int_labels = np.rint(labels).astype(int)

    qwk = cohen_kappa_score(int_labels, rounded_preds, weights='quadratic')
    mse = float(((preds - labels) ** 2).mean())
    return {"qwk": qwk, "mse": mse}


# # Initialize Model and Train  (notebook markdown cell)
# =====================================================================================
# 4. & 5. MAIN EXECUTION FUNCTIONS
# =====================================================================================
import json  # Added import for json


def _list_checkpoints(checkpoint_root):
    """Return the 'checkpoint-<step>' folder names in *checkpoint_root*, sorted by step.

    Entries whose suffix is not a number are ignored so that a stray folder
    cannot break the numeric sort.
    """
    names = [
        entry for entry in os.listdir(checkpoint_root)
        if entry.startswith("checkpoint-") and entry.split('-')[-1].isdigit()
    ]
    names.sort(key=lambda name: int(name.split('-')[-1]))
    return names


def _select_best_checkpoint(checkpoint_root):
    """Return the checkpoint directory to predict with.

    Prefers the 'best_model_checkpoint' recorded in the latest checkpoint's
    'trainer_state.json'; falls back to the latest checkpoint when that record
    is missing, unreadable, or points to a folder that no longer exists.

    Raises:
        FileNotFoundError: If *checkpoint_root* contains no checkpoint folders.
    """
    names = _list_checkpoints(checkpoint_root)
    if not names:
        raise FileNotFoundError(f"No checkpoint folders found in {checkpoint_root}.")

    latest_dir = os.path.join(checkpoint_root, names[-1])
    state_path = os.path.join(latest_dir, "trainer_state.json")
    best_dir = None
    try:
        with open(state_path, encoding="utf-8") as fh:
            best_dir = json.load(fh).get("best_model_checkpoint")
    except (OSError, ValueError):
        best_dir = None

    if best_dir and os.path.isdir(best_dir):
        print(f"Using best checkpoint directory: {best_dir}")
        return best_dir
    print(f"Using latest checkpoint directory: {latest_dir}")
    return latest_dir


def _load_checkpoint_state_dict(checkpoint_dir):
    """Load model weights from 'model.safetensors' or, failing that, 'pytorch_model.bin'.

    Raises:
        FileNotFoundError: If neither weights file exists in *checkpoint_dir*.
    """
    safetensors_path = os.path.join(checkpoint_dir, "model.safetensors")
    bin_path = os.path.join(checkpoint_dir, "pytorch_model.bin")

    if os.path.exists(safetensors_path):
        # Imported here because this script's import cell did not provide it.
        from safetensors.torch import load_file
        print("Found 'model.safetensors', loading weights...")
        return load_file(safetensors_path)
    if os.path.exists(bin_path):
        print("Found 'pytorch_model.bin', loading weights...")
        # map_location keeps GPU-saved weights loadable on a CPU-only runtime.
        return torch.load(bin_path, map_location="cpu")
    raise FileNotFoundError(
        f"Neither 'model.safetensors' nor 'pytorch_model.bin' found in {checkpoint_dir}"
    )


def main_train():
    """Train the hybrid regression model, resuming from the latest checkpoint if one exists."""
    print("===== 🚀 STARTING HYBRID REGRESSION MODEL PIPELINE =====\n")
    print("Initializing Tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    train_df, val_df = load_preprocessed_data()
    if train_df is None:
        print("\n! Aborting script due to data loading failure.")
        return

    print("\nCreating Torch Datasets...")
    train_dataset = ReadabilityDataset(
        train_df['d3tok_text'].tolist(),
        train_df['features'].tolist(),
        train_df['label'].tolist(),
        tokenizer,
    )
    val_dataset = ReadabilityDataset(
        val_df['d3tok_text'].tolist(),
        val_df['features'].tolist(),
        val_df['label'].tolist(),
        tokenizer,
    )
    print("✔ Datasets created.")

    print("\nInitializing Hybrid Regression Model...")
    model = HybridRegressionModel(MODEL_NAME, num_extra_features=NUM_FEATURES)

    # Training arguments are set to work well on Colab
    training_args = TrainingArguments(
        output_dir=CHECKPOINT_DIR,
        num_train_epochs=8,
        per_device_train_batch_size=16,  # Adjust if you face OOM errors
        per_device_eval_batch_size=64,
        learning_rate=3e-5,
        warmup_ratio=0.1,
        weight_decay=0.01,
        logging_steps=100,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="qwk",
        greater_is_better=True,
        save_total_limit=2,
        fp16=torch.cuda.is_available(),  # Automatically uses mixed precision on GPU
        report_to="none",  # Disables wandb or other reporting
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )

    print("\nStarting model training... Checkpoints will be saved to Google Drive.")

    # Check for latest checkpoint to resume from
    latest_checkpoint = None
    if os.path.exists(CHECKPOINT_DIR):
        checkpoints = _list_checkpoints(CHECKPOINT_DIR)
        if checkpoints:
            latest_checkpoint = os.path.join(CHECKPOINT_DIR, checkpoints[-1])
            print(f"Resuming training from latest checkpoint: {latest_checkpoint}")
        else:
            print("No checkpoints found to resume training from. Starting from scratch.")
    else:
        print("Checkpoint directory not found. Starting training from scratch.")

    trainer.train(resume_from_checkpoint=latest_checkpoint)
    print("✔ Training finished.")

    # Clean up memory
    del model, trainer, train_dataset, val_dataset, train_df, val_df
    gc.collect()
    torch.cuda.empty_cache()


def main_predict():
    """Predict on the test set with the best saved checkpoint and write the zipped submission.

    Errors are reported by printing; nothing is raised to the caller.
    """
    print("\n===== 🏆 GENERATING FINAL PREDICTIONS & SUBMISSION =====\n")
    try:
        print("Initializing Tokenizer for prediction...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

        print("Loading preprocessed test data from Google Drive...")
        test_df = pd.read_csv(TEST_PROCESSED_PATH)
        test_df['features'] = test_df['features'].apply(ast.literal_eval)

        print("\nLoading the best trained model from Google Drive checkpoints...")
        if not os.path.exists(CHECKPOINT_DIR):
            raise FileNotFoundError(f"Checkpoint directory not found on Google Drive: {CHECKPOINT_DIR}.")

        checkpoint_dir = _select_best_checkpoint(CHECKPOINT_DIR)
        state_dict = _load_checkpoint_state_dict(checkpoint_dir)

        model = HybridRegressionModel(MODEL_NAME, num_extra_features=NUM_FEATURES)
        model.load_state_dict(state_dict)

        # Same inference settings as the standalone prediction cells:
        # no external reporting, eval batch size 64.
        trainer = Trainer(
            model=model,
            args=TrainingArguments(
                output_dir=CHECKPOINT_DIR,
                per_device_eval_batch_size=64,
                report_to="none",
            ),
        )

        print("Generating predictions on the test set...")
        test_dataset = ReadabilityDataset(
            test_df['d3tok_text'].tolist(),
            test_df['features'].tolist(),
            tokenizer_obj=tokenizer,
        )
        predictions = trainer.predict(test_dataset)

        # Round, clip to 0..TARGET_CLASSES-1, then shift back to the 1-based label scale.
        clipped_preds = np.clip(np.round(predictions.predictions.flatten()), 0, TARGET_CLASSES - 1)
        test_df['Prediction'] = (clipped_preds + 1).astype(int)
        submission_df = test_df.rename(columns={'ID': 'id'})[['id', 'Prediction']]

        print(f"\nSaving prediction file to: {SUBMISSION_PATH}")
        submission_df.to_csv(SUBMISSION_PATH, index=False)

        print(f"Compressing into {os.path.basename(ZIPPED_SUBMISSION_PATH)}...")
        with zipfile.ZipFile(ZIPPED_SUBMISSION_PATH, 'w', zipfile.ZIP_DEFLATED) as zipf:
            zipf.write(SUBMISSION_PATH, arcname=os.path.basename(SUBMISSION_PATH))

        print(f"✔ Submission file '{os.path.basename(ZIPPED_SUBMISSION_PATH)}' created successfully in your Drive!")

    except FileNotFoundError as e:
        print(f"❌ ERROR: File not found: {e}. Ensure training was completed and checkpoints exist.")
    except Exception as e:
        # Notebook-level boundary: report the failure instead of crashing the cell.
        print(f"❌ An error occurred during final prediction: {e}")


# # Final Prediction and Save Submission to Drive  (notebook markdown cell)
# =====================================================================================
# 6. SCRIPT RUNNER
# =====================================================================================
# Start the training process
main_train()

# Once training is done, generate predictions
main_predict()

print("\n--- ✅ All Done! Check your Google Drive for results and submission files. ---")

# =====================================================================================
# 1. IMPORTS
# =====================================================================================
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import zipfile
import ast
from safetensors.torch import load_file
from torch.utils.data import Dataset as TorchDataset
from transformers import (
    AutoTokenizer,
    AutoModel,
    TrainingArguments,
    Trainer,
)
import gc

# =====================================================================================
# 2. CONFIGURATION
# =====================================================================================
# --- Model & Data Specs ---
MODEL_NAME = "CAMeL-Lab/readability-arabertv2-d3tok-reg"
NUM_LABELS = 1
TARGET_CLASSES = 19
NUM_FEATURES = 7

# --- Path to your specific checkpoint ---
# This is the full path to the checkpoint folder you want to use for predictions.
CHECKPOINT_PATH = "/content/drive/MyDrive/BAREC_Competition/results/hybrid_regression_readability-arabertv2-d3tok-reg/checkpoint-48944"

# --- Other Important Paths ---
PROJECT_DRIVE_PATH = '/content/drive/MyDrive/BAREC_Competition'
TEST_PROCESSED_PATH = os.path.join(PROJECT_DRIVE_PATH, "lex", 'test_processed_full.csv')
SUBMISSION_DIR = os.path.join(PROJECT_DRIVE_PATH, "submission")

# --- Submission File Names ---
SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "standalone_submission.csv")
ZIPPED_SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "standalone_submission.zip")

# Ensure the submission directory exists
os.makedirs(SUBMISSION_DIR, exist_ok=True)

print(f"✔️ Configuration loaded. Using checkpoint: {CHECKPOINT_PATH}")

# =====================================================================================
# 3. REQUIRED CLASS DEFINITIONS
# These classes MUST be defined so Python knows the structure of your model and data.
# =====================================================================================
class ReadabilityDataset(TorchDataset):
    """Custom PyTorch Dataset for handling texts, features, and labels.

    Args:
        texts: Sequence of input texts (each is passed through ``str``).
        features: Sequence of numerical feature vectors, one per text.
        labels: Optional sequence of float targets, one per text.
        tokenizer_obj: Tokenizer used to encode each text (required).
        max_len: Every sample is padded/truncated to this many tokens.

    Raises:
        ValueError: If no tokenizer is given or the sequences differ in length.
    """

    def __init__(self, texts, features, labels=None, tokenizer_obj=None, max_len=256):
        if tokenizer_obj is None:
            raise ValueError("tokenizer_obj is required to encode the texts.")
        if len(features) != len(texts):
            raise ValueError(
                f"texts and features differ in length: {len(texts)} vs {len(features)}"
            )
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f"texts and labels differ in length: {len(texts)} vs {len(labels)}"
            )
        self.texts = texts
        self.features = features
        self.labels = labels
        self.tokenizer = tokenizer_obj
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask', 'features' and, if available, 'labels'."""
        text = str(self.texts[idx])
        feature_vec = torch.tensor(self.features[idx], dtype=torch.float)
        # token_type_ids are not requested: the model's forward() never uses them.
        encoded = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )
        item = {
            'input_ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'features': feature_vec,
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item


class HybridRegressionModel(nn.Module):
    """The architecture of your saved model. This must match the training script.

    NOTE(review): a later cell defines a different head (512 units + BatchNorm)
    and states that *that* one matches checkpoint-48944. Both cannot load the
    same weights with a strict ``load_state_dict`` — confirm which head the
    checkpoint was trained with.
    """

    def __init__(self, model_name, num_extra_features):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        self._keys_to_ignore_on_save = []
        self.head = nn.Sequential(
            nn.Linear(self.transformer.config.hidden_size + num_extra_features, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, NUM_LABELS),
        )

    def forward(self, input_ids, attention_mask, features, labels=None):
        """Return ``(loss, logits)`` when *labels* is given, otherwise just ``logits``."""
        transformer_outputs = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        cls_token_output = transformer_outputs.last_hidden_state[:, 0, :]
        combined_features = torch.cat([cls_token_output, features], dim=1)
        logits = self.head(combined_features)

        if labels is None:
            return logits
        loss = nn.MSELoss()(logits.view(-1), labels.view(-1))
        return (loss, logits)


# =====================================================================================
# 4. PREDICTION LOGIC
# =====================================================================================
def generate_predictions(checkpoint_path):
    """Load the model from *checkpoint_path*, predict on the test set and write the zipped submission.

    Args:
        checkpoint_path: Checkpoint folder that contains 'model.safetensors'.

    Errors are reported by printing; nothing is raised to the caller.
    """
    print("\n===== 🏆 STARTING PREDICTION PIPELINE =====\n")
    # Bound up front so the cleanup in `finally` is valid even when an early
    # step fails (the original `del` raised UnboundLocalError in that case).
    model = trainer = test_dataset = None
    try:
        # --- 1. Load Tokenizer and Test Data ---
        print("Initializing Tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

        print(f"Loading preprocessed test data from: {TEST_PROCESSED_PATH}")
        test_df = pd.read_csv(TEST_PROCESSED_PATH)
        test_df['features'] = test_df['features'].apply(ast.literal_eval)
        print(f"✔ Successfully loaded {len(test_df)} test records.")

        # --- 2. Load Model Weights ---
        print(f"\nLoading model from checkpoint: {checkpoint_path}")
        model_weights_path = os.path.join(checkpoint_path, "model.safetensors")
        if not os.path.exists(model_weights_path):
            raise FileNotFoundError("Could not find 'model.safetensors' in the specified checkpoint directory.")

        # Initialize the model architecture
        model = HybridRegressionModel(MODEL_NAME, num_extra_features=NUM_FEATURES)

        # Load the saved weights into the model
        state_dict = load_file(model_weights_path)
        model.load_state_dict(state_dict)
        print("✔ Model weights loaded successfully.")

        # --- 3. Run Prediction ---
        # The Trainer is a convenient way to run predictions in batches
        trainer = Trainer(
            model=model,
            args=TrainingArguments(
                output_dir="./temp_results",
                per_device_eval_batch_size=64,
                report_to="none",  # no external experiment logging during inference
            ),
        )

        print("\nGenerating predictions on the test set...")
        test_dataset = ReadabilityDataset(
            test_df['d3tok_text'].tolist(),
            test_df['features'].tolist(),
            tokenizer_obj=tokenizer,
        )
        raw_predictions = trainer.predict(test_dataset)
        print("✔ Predictions generated.")

        # --- 4. Process and Save Submission File ---
        # Clip predictions to the valid range [0, 18] and round to the nearest integer
        clipped_preds = np.clip(np.round(raw_predictions.predictions.flatten()), 0, TARGET_CLASSES - 1)

        # Add 1 to convert from 0-18 range to 1-19 range for the final label
        test_df['Prediction'] = (clipped_preds + 1).astype(int)

        # Prepare the submission file in the required format
        submission_df = test_df.rename(columns={'ID': 'id'})[['id', 'Prediction']]

        print(f"\nSaving prediction file to: {SUBMISSION_PATH}")
        submission_df.to_csv(SUBMISSION_PATH, index=False)

        print(f"Compressing into {os.path.basename(ZIPPED_SUBMISSION_PATH)}...")
        with zipfile.ZipFile(ZIPPED_SUBMISSION_PATH, 'w', zipfile.ZIP_DEFLATED) as zipf:
            zipf.write(SUBMISSION_PATH, arcname=os.path.basename(SUBMISSION_PATH))

        print(f"\n--- ✅ SUCCESS! Submission file '{os.path.basename(ZIPPED_SUBMISSION_PATH)}' created in your Drive! ---")

    except FileNotFoundError as e:
        print("❌ ERROR: A required file was not found. Please check your paths.")
        print(e)
    except Exception as e:
        print(f"❌ An unexpected error occurred: {e}")
    finally:
        # Clean up memory: drop the large references, then collect.
        model = trainer = test_dataset = None
        gc.collect()
        torch.cuda.empty_cache()


# =====================================================================================
# 5. EXECUTE THE SCRIPT
# =====================================================================================
if __name__ == "__main__":
    generate_predictions(CHECKPOINT_PATH)

# =====================================================================================
# 1. IMPORTS
# =====================================================================================
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import ast
from torch.utils.data import Dataset as TorchDataset
from sklearn.metrics import cohen_kappa_score
from transformers import (
    AutoTokenizer,
    AutoModel,
    TrainingArguments,
    Trainer,
)
import gc
# This import is crucial for loading the model file
from safetensors.torch import load_file

# =====================================================================================
# 2. CONFIGURATION
# =====================================================================================
MODEL_NAME = "CAMeL-Lab/readability-arabertv2-d3tok-reg"
NUM_LABELS = 1
TARGET_CLASSES = 19
NUM_FEATURES = 7
CHECKPOINT_PATH = "/content/drive/MyDrive/BAREC_Competition/results/hybrid_regression_readability-arabertv2-d3tok-reg/checkpoint-48944"
EVAL_DATA_PATH = "/content/drive/MyDrive/BAREC_Competition/lex/dev_processed_full.csv"

print(f"✔️ Configuration loaded. Using checkpoint: {CHECKPOINT_PATH}")
# =====================================================================================
# 3. CORRECT MODEL ARCHITECTURE AND HELPERS
# =====================================================================================
class HybridRegressionModel(nn.Module):
    """
    This is the correct, complex architecture from your notebook
    that matches checkpoint-48944.

    The position-0 hidden state of the transformer is concatenated with the
    extra numerical features and passed through a 512 -> 256 -> 1 MLP head.
    """

    def __init__(self, model_name, num_extra_features):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        transformer_output_dim = self.transformer.config.hidden_size
        # Layer order is part of the checkpoint's state_dict keys (head.0, head.1, ...).
        self.head = nn.Sequential(
            nn.Linear(transformer_output_dim + num_extra_features, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 1),
        )

    def forward(self, input_ids, attention_mask, features, labels=None):
        """Return ``(loss, logits)`` when *labels* is given, otherwise just ``logits``."""
        transformer_outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = transformer_outputs.last_hidden_state[:, 0, :]
        combined_features = torch.cat([cls_embedding, features], dim=1)
        logits = self.head(combined_features).squeeze(-1)

        if labels is None:
            return logits
        loss = nn.MSELoss()(logits, labels.float())
        return (loss, logits)


class ReadabilityDataset(TorchDataset):
    """Custom Dataset to format data for the model.

    Args:
        texts: Sequence of input texts (each is passed through ``str``).
        features: Sequence of numerical feature vectors, one per text.
        labels: Optional sequence of float targets, one per text.
        tokenizer_obj: Tokenizer used to encode each text (required).
        max_len: Every sample is padded/truncated to this many tokens.

    Raises:
        ValueError: If no tokenizer is given or the sequences differ in length.
    """

    def __init__(self, texts, features, labels=None, tokenizer_obj=None, max_len=256):
        if tokenizer_obj is None:
            raise ValueError("tokenizer_obj is required to encode the texts.")
        if len(features) != len(texts):
            raise ValueError(
                f"texts and features differ in length: {len(texts)} vs {len(features)}"
            )
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f"texts and labels differ in length: {len(texts)} vs {len(labels)}"
            )
        self.texts = texts
        self.features = features
        self.labels = labels
        self.tokenizer = tokenizer_obj
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask', 'features' and, if available, 'labels'."""
        text = str(self.texts[idx])
        feature_vec = torch.tensor(self.features[idx], dtype=torch.float)
        # token_type_ids are not requested: the model's forward() never uses them.
        encoded = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )
        # The key is 'features' (not 'numerical_features') to match the model's forward pass
        item = {
            'input_ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'features': feature_vec,
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item


def compute_metrics(p):
    """Compute QWK and MSE for evaluation.

    Args:
        p: Object exposing ``predictions`` and ``label_ids`` arrays.

    Returns:
        Dict with keys "qwk" and "mse".
    """
    # Flatten both sides so a trailing singleton axis cannot broadcast to N x N.
    preds = np.asarray(p.predictions).reshape(-1)
    labels = np.asarray(p.label_ids).reshape(-1)
    rounded_preds = np.clip(np.round(preds), 0, TARGET_CLASSES - 1).astype(int)
    # Labels are stored as floats; kappa is computed on integer class ids.
    int_labels = np.rint(labels).astype(int)
    qwk = cohen_kappa_score(int_labels, rounded_preds, weights='quadratic')
    mse = float(((preds - labels) ** 2).mean())
    return {"qwk": qwk, "mse": mse}


# =====================================================================================
# 4. STANDALONE EVALUATION LOGIC
# =====================================================================================
def evaluate_checkpoint(checkpoint_path):
    """Evaluate the weights in *checkpoint_path* on the dev set and print QWK and MSE.

    Args:
        checkpoint_path: Checkpoint folder that contains 'model.safetensors'.

    Errors are reported by printing; nothing is raised to the caller.
    """
    print("\n===== 🚀 STARTING EVALUATION PIPELINE =====\n")
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        eval_df = pd.read_csv(EVAL_DATA_PATH)
        eval_df['features'] = eval_df['features'].apply(ast.literal_eval)
        # Same zero-based float label scale the model was trained on.
        eval_df['label'] = (eval_df['label'].astype(int) - 1).astype(float)
        print(f"✔ Successfully loaded {len(eval_df)} evaluation records.")

        # Correctly load the model.safetensors file
        model_weights_path = os.path.join(checkpoint_path, "model.safetensors")
        if not os.path.exists(model_weights_path):
            raise FileNotFoundError(f"'model.safetensors' not found in {checkpoint_path}")

        model = HybridRegressionModel(MODEL_NAME, num_extra_features=NUM_FEATURES)
        state_dict = load_file(model_weights_path)
        model.load_state_dict(state_dict)
        print("✔ Model weights loaded successfully.")

        trainer = Trainer(
            model=model,
            args=TrainingArguments(
                output_dir="./temp_results",
                per_device_eval_batch_size=64,
                report_to="none",  # no external experiment logging during evaluation
            ),
            compute_metrics=compute_metrics,
        )

        print("\nRunning evaluation...")
        eval_dataset = ReadabilityDataset(
            eval_df['d3tok_text'].tolist(),
            eval_df['features'].tolist(),
            eval_df['label'].tolist(),
            tokenizer_obj=tokenizer,
        )
        results = trainer.evaluate(eval_dataset)

        print("\n--- ✅ EVALUATION COMPLETE ---")
        # Index directly: a missing metric should name the key, not fail inside formatting.
        print(f"Quadratic Weighted Kappa (QWK): {results['eval_qwk']:.4f}")
        print(f"Mean Squared Error (MSE): {results['eval_mse']:.4f}")
        print("---------------------------------")

    except Exception as e:
        print(f"❌ An error occurred: {e}")
    finally:
        gc.collect()
        torch.cuda.empty_cache()


# =====================================================================================
# 5. EXECUTE THE SCRIPT
# =====================================================================================
evaluate_checkpoint(CHECKPOINT_PATH)

# =====================================================================================
# 1. IMPORTS
# =====================================================================================
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import ast
from torch.utils.data import Dataset as TorchDataset
from transformers import (
    AutoTokenizer,
    AutoModel,
    TrainingArguments,
    Trainer,
)
import gc
# This import is crucial for loading the model file
from safetensors.torch import load_file

# =====================================================================================
# 2. CONFIGURATION
# =====================================================================================
MODEL_NAME = "CAMeL-Lab/readability-arabertv2-d3tok-reg"
TARGET_CLASSES = 19
NUM_FEATURES = 7
CHECKPOINT_PATH = "/content/drive/MyDrive/BAREC_Competition/results/hybrid_regression_readability-arabertv2-d3tok-reg/checkpoint-48944"
# Path to the blind test set
TEST_DATA_PATH = "/content/drive/MyDrive/BAREC_Competition/lex/test_processed_full.csv"
# Path for the final output file
PREDICTION_FILE_PATH = "/content/drive/MyDrive/BAREC_Competition/submission/test_predictions.csv"

print(f"✔️ Configuration loaded. Using checkpoint: {CHECKPOINT_PATH}")
print(f"✔️ Predicting on data from: {TEST_DATA_PATH}")
# =====================================================================================
# 3. CORRECT MODEL ARCHITECTURE AND DATASET CLASS
# =====================================================================================
class HybridRegressionModel(nn.Module):
    """
    This is the correct, complex architecture from your notebook
    that matches checkpoint-48944.

    The position-0 hidden state of the transformer is concatenated with the
    extra numerical features and passed through a 512 -> 256 -> 1 MLP head.
    """

    def __init__(self, model_name, num_extra_features):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        transformer_output_dim = self.transformer.config.hidden_size
        # Layer order is part of the checkpoint's state_dict keys (head.0, head.1, ...).
        self.head = nn.Sequential(
            nn.Linear(transformer_output_dim + num_extra_features, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 1),
        )

    def forward(self, input_ids, attention_mask, features, labels=None):
        """Return ``(loss, logits)`` when *labels* is given, otherwise just ``logits``."""
        transformer_outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = transformer_outputs.last_hidden_state[:, 0, :]
        combined_features = torch.cat([cls_embedding, features], dim=1)
        logits = self.head(combined_features).squeeze(-1)

        if labels is None:
            return logits
        loss = nn.MSELoss()(logits, labels.float())
        return (loss, logits)


class ReadabilityDataset(TorchDataset):
    """Custom Dataset to format data for the model.

    Args:
        texts: Sequence of input texts (each is passed through ``str``).
        features: Sequence of numerical feature vectors, one per text.
        labels: Optional sequence of float targets, one per text.
        tokenizer_obj: Tokenizer used to encode each text (required).
        max_len: Every sample is padded/truncated to this many tokens.

    Raises:
        ValueError: If no tokenizer is given or the sequences differ in length.
    """

    def __init__(self, texts, features, labels=None, tokenizer_obj=None, max_len=256):
        if tokenizer_obj is None:
            raise ValueError("tokenizer_obj is required to encode the texts.")
        if len(features) != len(texts):
            raise ValueError(
                f"texts and features differ in length: {len(texts)} vs {len(features)}"
            )
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f"texts and labels differ in length: {len(texts)} vs {len(labels)}"
            )
        self.texts = texts
        self.features = features
        self.labels = labels
        self.tokenizer = tokenizer_obj
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask', 'features' and, if available, 'labels'."""
        text = str(self.texts[idx])
        feature_vec = torch.tensor(self.features[idx], dtype=torch.float)
        # token_type_ids are not requested: the model's forward() never uses them.
        encoded = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
        )
        # The key is 'features' to match the model's forward pass argument
        item = {
            'input_ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'features': feature_vec,
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item


# =====================================================================================
# 4. PREDICTION LOGIC
# =====================================================================================
def predict_on_test_set(checkpoint_path):
    """Load the model from *checkpoint_path*, predict on the blind test set and save a CSV.

    The output has two columns, 'id' and 'label', with labels on the 1-based scale.

    Args:
        checkpoint_path: Checkpoint folder that contains 'model.safetensors'.

    Errors are reported by printing; nothing is raised to the caller.
    """
    print("\n===== 🚀 STARTING PREDICTION PIPELINE =====\n")
    try:
        # --- 1. Load Tokenizer and Test Data ---
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        test_df = pd.read_csv(TEST_DATA_PATH)
        test_df['features'] = test_df['features'].apply(ast.literal_eval)
        print(f"✔ Successfully loaded {len(test_df)} test records.")

        # The other prediction cells rename 'ID' -> 'id' for this same file,
        # so accept either spelling instead of assuming lowercase.
        if 'id' in test_df.columns:
            id_column = 'id'
        elif 'ID' in test_df.columns:
            id_column = 'ID'
        else:
            raise KeyError(f"No 'id' or 'ID' column found in {TEST_DATA_PATH}")

        # --- 2. Load Model ---
        model_weights_path = os.path.join(checkpoint_path, "model.safetensors")
        if not os.path.exists(model_weights_path):
            raise FileNotFoundError(f"'model.safetensors' not found in {checkpoint_path}")

        model = HybridRegressionModel(MODEL_NAME, num_extra_features=NUM_FEATURES)
        state_dict = load_file(model_weights_path)
        model.load_state_dict(state_dict)
        print("✔ Model weights loaded successfully.")

        # --- 3. Run Prediction ---
        # Disable logging to wandb for cleaner prediction output
        training_args = TrainingArguments(
            output_dir="./temp_results",
            per_device_eval_batch_size=64,
            report_to="none",
        )
        trainer = Trainer(model=model, args=training_args)

        print("\nRunning prediction on the test set...")
        # Create dataset without labels for the test set
        test_dataset = ReadabilityDataset(
            test_df['d3tok_text'].tolist(),
            test_df['features'].tolist(),
            labels=None,
            tokenizer_obj=tokenizer,
        )
        raw_predictions = trainer.predict(test_dataset)

        # --- 4. Process and Save Predictions ---
        # Get the continuous predictions
        preds = raw_predictions.predictions.flatten()

        # Round to nearest integer, clip to valid range (0-18), and shift to final label (1-19)
        final_labels = np.clip(np.round(preds), 0, TARGET_CLASSES - 1).astype(int) + 1

        # Create submission DataFrame
        submission_df = pd.DataFrame({
            'id': test_df[id_column],
            'label': final_labels,
        })

        # This cell never creates the submission folder itself, so make sure it exists.
        output_dir = os.path.dirname(PREDICTION_FILE_PATH)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        print(f"\nSaving predictions to: {PREDICTION_FILE_PATH}")
        submission_df.to_csv(PREDICTION_FILE_PATH, index=False)

        print("\n--- ✅ PREDICTION COMPLETE ---")
        print("Output saved successfully to your Google Drive.")
        print("---------------------------------")

    except Exception as e:
        print(f"❌ An error occurred: {e}")
    finally:
        gc.collect()
        torch.cuda.empty_cache()


# =====================================================================================
# 5. EXECUTE THE SCRIPT
# =====================================================================================
predict_on_test_set(CHECKPOINT_PATH)