# -*- coding: utf-8 -*-
"""hybird_constrinc_samer_n_lex_reg (82.9).ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/17rYEHl4vHDcV_mt-GgizkBfTPgAJn5l1

# Cell 2: Connect to Google Drive
"""

# Authorize Colab to access your Google Drive.
# The drive is mounted at /content/drive, which is the root that the
# project path constants defined further down are built on.
from google.colab import drive
drive.mount('/content/drive')

"""#  Define All Paths on Google Drive"""

# =====================================================================================
# 1. IMPORTS & CONFIGURATION
# =====================================================================================

import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import zipfile
from sklearn.metrics import cohen_kappa_score
from torch.utils.data import Dataset as TorchDataset
from transformers import (
    AutoTokenizer,
    AutoModel,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
import gc
import ast # To safely evaluate string-formatted lists

# --- Model & Training ---
# Hub identifier used for both the tokenizer and the transformer backbone.
MODEL_NAME = "CAMeL-Lab/readability-arabertv2-d3tok-reg"

# Output width of the regression head (a single continuous score).
NUM_LABELS = 1
# Number of discrete levels; rounded predictions are clipped to [0, TARGET_CLASSES - 1].
TARGET_CLASSES = 19
# Length of the numeric feature vector concatenated to the text embedding.
NUM_FEATURES = 7

# --- IMPORTANT: Set the path to your project folder on Google Drive ---
PROJECT_DRIVE_PATH = '/content/drive/MyDrive/BAREC_Competition'

# --- File & Directory Paths (Now relative to your Google Drive) ---
BASE_DIR = PROJECT_DRIVE_PATH
PROCESSED_DATA_DIR = os.path.join(BASE_DIR, "lex")
# Checkpoints live in a folder named after the last component of MODEL_NAME.
CHECKPOINT_DIR = os.path.join(BASE_DIR, "results", f"hybrid_regression_{MODEL_NAME.split('/')[-1]}")
SUBMISSION_DIR = os.path.join(BASE_DIR, "submission")

# Ensure the output directories exist on your Google Drive
# (module-level side effect: both directories are created at import/run time).
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
os.makedirs(SUBMISSION_DIR, exist_ok=True)

# Paths to the preprocessed input files on Google Drive
TRAIN_PROCESSED_PATH = os.path.join(PROCESSED_DATA_DIR, 'train_processed_full.csv')
DEV_PROCESSED_PATH = os.path.join(PROCESSED_DATA_DIR, 'dev_processed_full.csv')
TEST_PROCESSED_PATH = os.path.join(PROCESSED_DATA_DIR, 'test_processed_full.csv')

# --- Submission Paths on Google Drive ---
# The CSV is written first and then compressed into the ZIP next to it.
SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_hybrid_regression.csv")
ZIPPED_SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_hybrid_regression.zip")

print(f"✔️ All paths configured to use Google Drive folder: {BASE_DIR}")

"""#  Data Loading and Helper Functions"""

# =====================================================================================
# 2. DATA LOADING FUNCTION
# =====================================================================================

def load_preprocessed_data():
    """
    Read the preprocessed train and dev CSV files and prepare them for regression.

    The 'features' column of each frame is parsed from its string form with
    ``ast.literal_eval`` and the 'label' column is shifted down by one and
    stored as float.

    Returns:
        ``(train_df, val_df)`` on success, or ``(None, None)`` when a file is
        missing or any other error occurs (the error is printed, not raised).
    """
    print("\n--- Loading Preprocessed Data from Google Drive ---")
    try:
        train_df = pd.read_csv(TRAIN_PROCESSED_PATH)
        val_df = pd.read_csv(DEV_PROCESSED_PATH)
        frames = (train_df, val_df)

        print("Converting 'features' column from string to list...")
        # Row-wise literal_eval can be slow, but it is only done once per frame.
        for frame in frames:
            frame['features'] = frame['features'].apply(ast.literal_eval)

        # Shift labels down by one and store them as floats for the MSE objective.
        for frame in frames:
            zero_based = frame['label'].astype(int) - 1
            frame['label'] = zero_based.astype(float)

        print(f"✔ Successfully loaded {len(train_df)} training and {len(val_df)} validation records.")
        return train_df, val_df
    except FileNotFoundError as e:
        print(f"❌ ERROR: Preprocessed file not found: {e}.")
        print("Please make sure your data is uploaded to the correct Google Drive folder.")
    except Exception as e:
        print(f"❌ ERROR during data loading: {e}")
    return None, None

# =====================================================================================
# 3. DATASET, MODEL & METRICS DEFINITIONS
# =====================================================================================

class ReadabilityDataset(TorchDataset):
    """
    Torch dataset that pairs each text with its numeric feature vector and,
    when labels are supplied, its regression target.

    Args:
        texts: Indexable collection of texts; each item is converted with ``str``.
        features: Indexable collection of per-text numeric feature sequences.
        labels: Optional indexable collection of numeric targets. When ``None``
            the returned samples carry no 'labels' entry.
        tokenizer_obj: Tokenizer exposing ``encode_plus``; needed once items are accessed.
        max_len: Passed to the tokenizer as ``max_length`` (with
            ``padding='max_length'`` and truncation enabled).
    """

    def __init__(self, texts, features, labels=None, tokenizer_obj=None, max_len=256):
        self.texts, self.features, self.labels = texts, features, labels
        self.tokenizer, self.max_len = tokenizer_obj, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict of tensors for sample ``idx``."""
        raw_text = str(self.texts[idx])
        extra_features = torch.tensor(self.features[idx], dtype=torch.float)

        # Encode the text to fixed-length id / mask lists.
        encoding = self.tokenizer.encode_plus(
            raw_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )

        # Only the ids and the mask are forwarded; token type ids are not used.
        sample = {
            field: torch.tensor(encoding[field], dtype=torch.long)
            for field in ('input_ids', 'attention_mask')
        }
        sample['features'] = extra_features

        # Unlabeled data (e.g. the test set) stops here.
        if self.labels is None:
            return sample

        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

class HybridRegressionModel(nn.Module):
    """
    Regression network made of a pre-trained transformer encoder and a small
    MLP head. The head receives the first-token embedding concatenated with
    additional numeric features.

    Args:
        model_name: Identifier handed to ``AutoModel.from_pretrained``.
        num_extra_features: Width of the per-sample feature vector appended
            to the embedding before the head.
    """

    def __init__(self, model_name, num_extra_features):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        self._keys_to_ignore_on_save = []

        # The attribute must stay named 'head' so that parameter names match
        # the ones stored in saved checkpoint files.
        fused_width = self.transformer.config.hidden_size + num_extra_features
        head_layers = [
            nn.Linear(fused_width, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, NUM_LABELS),
        ]
        self.head = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, features, labels=None):
        """
        Run the encoder and the head.

        Returns:
            The head output alone when ``labels`` is ``None``; otherwise a
            ``(loss, logits)`` tuple where ``loss`` is the MSE between the
            flattened head output and the flattened labels.
        """
        encoded = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Embedding at sequence position 0, joined with the numeric features.
        first_token = encoded.last_hidden_state[:, 0, :]
        logits = self.head(torch.cat([first_token, features], dim=1))

        if labels is None:
            return logits

        loss = nn.MSELoss()(logits.view(-1), labels.view(-1))
        return (loss, logits)


def compute_metrics(p):
    """
    Score regression outputs with Quadratic Weighted Kappa and MSE.

    Args:
        p: Object exposing ``predictions`` (flattened here) and ``label_ids``.

    Returns:
        Dict with 'qwk' (kappa between the labels and the rounded, clipped
        predictions) and 'mse' (mean squared error of the raw predictions).
    """
    raw_scores = p.predictions.flatten()
    gold = p.label_ids

    # Snap to the nearest level and keep the result inside [0, TARGET_CLASSES - 1].
    upper_level = TARGET_CLASSES - 1
    discrete_scores = np.clip(np.round(raw_scores), 0, upper_level)

    metrics = {"qwk": cohen_kappa_score(gold, discrete_scores, weights='quadratic')}

    # MSE is measured on the continuous outputs, before rounding.
    residuals = raw_scores - gold
    metrics["mse"] = (residuals ** 2).mean()
    return metrics

"""#  Initialize Model and Train"""

# =====================================================================================
# 4. & 5. MAIN EXECUTION FUNCTIONS
# =====================================================================================

import json # Added import for json

def main_train():
    """
    Train the hybrid regression model end to end.

    Loads the tokenizer and the preprocessed data, builds the datasets and the
    model, then trains with per-epoch evaluation/checkpointing and early
    stopping. If CHECKPOINT_DIR already holds ``checkpoint-*`` folders,
    training resumes from the one with the highest numeric suffix. Returns
    early (after printing a message) when the data could not be loaded.
    """
    print("===== 🚀 STARTING HYBRID REGRESSION MODEL PIPELINE =====\n")

    print("Initializing Tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    train_df, val_df = load_preprocessed_data()
    if train_df is None:
        print("\n! Aborting script due to data loading failure.")
        return

    def build_dataset(frame):
        # Text, feature and label columns become plain lists for the dataset.
        return ReadabilityDataset(
            frame['d3tok_text'].tolist(),
            frame['features'].tolist(),
            frame['label'].tolist(),
            tokenizer,
        )

    print("\nCreating Torch Datasets...")
    train_dataset = build_dataset(train_df)
    val_dataset = build_dataset(val_df)
    print("✔ Datasets created.")

    print("\nInitializing Hybrid Regression Model...")
    model = HybridRegressionModel(MODEL_NAME, num_extra_features=NUM_FEATURES)

    # Hyper-parameters chosen to work well on Colab.
    hyperparameters = dict(
        output_dir=CHECKPOINT_DIR,
        num_train_epochs=8,
        per_device_train_batch_size=16,  # lower this on out-of-memory errors
        per_device_eval_batch_size=64,
        learning_rate=3e-5,
        warmup_ratio=0.1,
        weight_decay=0.01,
        logging_steps=100,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="qwk",
        greater_is_better=True,
        save_total_limit=2,
        fp16=torch.cuda.is_available(),  # mixed precision whenever a GPU is present
        report_to="none",  # no wandb or other reporting
    )
    training_args = TrainingArguments(**hyperparameters)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )

    print("\nStarting model training... Checkpoints will be saved to Google Drive.")

    # Look for an existing checkpoint so an interrupted run can continue.
    latest_checkpoint = None
    if not os.path.exists(CHECKPOINT_DIR):
        print("Checkpoint directory not found. Starting training from scratch.")
    else:
        checkpoints = sorted(
            (entry for entry in os.listdir(CHECKPOINT_DIR) if entry.startswith("checkpoint-")),
            key=lambda entry: int(entry.split('-')[-1]),
        )
        if not checkpoints:
            print("No checkpoints found to resume training from. Starting from scratch.")
        else:
            latest_checkpoint = os.path.join(CHECKPOINT_DIR, checkpoints[-1])
            print(f"Resuming training from latest checkpoint: {latest_checkpoint}")

    trainer.train(resume_from_checkpoint=latest_checkpoint)
    print("✔ Training finished.")

    # Drop the large objects before asking the allocator to release memory.
    del model, trainer, train_dataset, val_dataset, train_df, val_df
    gc.collect()
    torch.cuda.empty_cache()

def _resolve_best_checkpoint(checkpoint_root, checkpoint_names):
    """
    Pick the checkpoint directory that main_predict should load.

    Args:
        checkpoint_root: Directory containing the ``checkpoint-*`` folders.
        checkpoint_names: Non-empty list of folder names sorted by step, ascending.

    Returns:
        The folder recorded under 'best_model_checkpoint' in the newest
        checkpoint's ``trainer_state.json`` when that file is readable and the
        folder still exists under ``checkpoint_root``; otherwise the newest
        checkpoint folder.
    """
    import json

    latest_dir = os.path.join(checkpoint_root, checkpoint_names[-1])
    state_path = os.path.join(latest_dir, "trainer_state.json")
    try:
        with open(state_path, encoding="utf-8") as state_file:
            best_recorded = json.load(state_file).get("best_model_checkpoint")
    except (OSError, ValueError, AttributeError):
        # Missing/unreadable/unexpected state file: fall back to the newest checkpoint.
        best_recorded = None

    if best_recorded:
        # Only the folder name is reused, so a path recorded under a different
        # mount point still resolves below checkpoint_root.
        candidate = os.path.join(
            checkpoint_root, os.path.basename(os.path.normpath(str(best_recorded)))
        )
        if os.path.isdir(candidate):
            return candidate
    return latest_dir


def main_predict():
    """
    Load trained weights, predict on the test set and write the submission.

    The checkpoint is the one recorded as best in the newest checkpoint's
    trainer state (falling back to the newest checkpoint). Weights are read
    from 'model.safetensors' or, failing that, 'pytorch_model.bin'. Rounded
    predictions are clipped to [0, TARGET_CLASSES - 1], shifted up by one and
    saved as a CSV that is then zipped.

    All errors are caught and printed; nothing is raised to the caller.
    """
    print("\n===== 🏆 GENERATING FINAL PREDICTIONS & SUBMISSION =====\n")
    try:
        print("Initializing Tokenizer for prediction...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

        print("Loading preprocessed test data from Google Drive...")
        test_df = pd.read_csv(TEST_PROCESSED_PATH)
        test_df['features'] = test_df['features'].apply(ast.literal_eval)

        print("\nLoading the best trained model from Google Drive checkpoints...")
        if not os.path.exists(CHECKPOINT_DIR):
            raise FileNotFoundError(f"Checkpoint directory not found on Google Drive: {CHECKPOINT_DIR}.")

        checkpoints = sorted(
            [d for d in os.listdir(CHECKPOINT_DIR) if d.startswith("checkpoint-")],
            key=lambda x: int(x.split('-')[-1])
        )

        if not checkpoints:
            raise FileNotFoundError(f"No checkpoint folders found in {CHECKPOINT_DIR}.")

        # With early stopping the newest checkpoint is generally not the best
        # one, so prefer the checkpoint the trainer state marks as best.
        checkpoint_dir = _resolve_best_checkpoint(CHECKPOINT_DIR, checkpoints)
        print(f"Using checkpoint directory: {checkpoint_dir}")

        # Check for 'model.safetensors' first, then 'pytorch_model.bin'.
        model_path_safetensors = os.path.join(checkpoint_dir, "model.safetensors")
        model_path_bin = os.path.join(checkpoint_dir, "pytorch_model.bin")

        if os.path.exists(model_path_safetensors):
            # Imported here because no safetensors import has executed yet at
            # the point where this function is first called in the file.
            from safetensors.torch import load_file

            print("Found 'model.safetensors', loading weights...")
            state_dict = load_file(model_path_safetensors)
        elif os.path.exists(model_path_bin):
            print("Found 'pytorch_model.bin', loading weights...")
            # map_location keeps loading usable on a CPU-only runtime.
            state_dict = torch.load(model_path_bin, map_location="cpu")
        else:
            raise FileNotFoundError(f"Neither 'model.safetensors' nor 'pytorch_model.bin' found in {checkpoint_dir}")

        model = HybridRegressionModel(MODEL_NAME, num_extra_features=NUM_FEATURES)
        model.load_state_dict(state_dict)

        # report_to="none" keeps prediction free of external experiment loggers.
        prediction_args = TrainingArguments(
            output_dir=CHECKPOINT_DIR,
            per_device_eval_batch_size=64,
            report_to="none",
        )
        trainer = Trainer(model=model, args=prediction_args)

        print("Generating predictions on the test set...")
        test_dataset = ReadabilityDataset(test_df['d3tok_text'].tolist(), test_df['features'].tolist(), tokenizer_obj=tokenizer)
        predictions = trainer.predict(test_dataset)

        # Round, clip to the valid 0-based range, then shift back to 1-based labels.
        clipped_preds = np.clip(np.round(predictions.predictions.flatten()), 0, TARGET_CLASSES - 1)
        test_df['Prediction'] = (clipped_preds + 1).astype(int)

        # Accept an 'ID' column by renaming it to the 'id' name used in the output.
        submission_df = test_df.rename(columns={'ID': 'id'})[['id', 'Prediction']]

        print(f"\nSaving prediction file to: {SUBMISSION_PATH}")
        submission_df.to_csv(SUBMISSION_PATH, index=False)

        print(f"Compressing into {os.path.basename(ZIPPED_SUBMISSION_PATH)}...")
        with zipfile.ZipFile(ZIPPED_SUBMISSION_PATH, 'w', zipfile.ZIP_DEFLATED) as zipf:
            zipf.write(SUBMISSION_PATH, arcname=os.path.basename(SUBMISSION_PATH))

        print(f"✔ Submission file '{os.path.basename(ZIPPED_SUBMISSION_PATH)}' created successfully in your Drive!")

    except FileNotFoundError as e:
        print(f"❌ ERROR: File not found: {e}. Ensure training was completed and checkpoints exist.")
    except Exception as e:
        print(f"❌ An error occurred during final prediction: {e}")

"""# Final Prediction and Save Submission to Drive"""

# =====================================================================================
# 6. SCRIPT RUNNER
# =====================================================================================

# Start the training process (resumes from an existing checkpoint if one is found).
main_train()

# Once training is done, generate predictions and the zipped submission.
main_predict()

# NOTE: both functions above report failures by printing instead of raising,
# so this message is printed even when one of the stages did not succeed.
print("\n--- ✅ All Done! Check your Google Drive for results and submission files. ---")

# =====================================================================================
# 1. IMPORTS
# =====================================================================================
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import zipfile
import ast
from safetensors.torch import load_file
from torch.utils.data import Dataset as TorchDataset
from transformers import (
    AutoTokenizer,
    AutoModel,
    TrainingArguments,
    Trainer,
)
import gc

# =====================================================================================
# 2. CONFIGURATION
# =====================================================================================
# --- Model & Data Specs ---
# These repeat the values of the training section so this part can run on its own.
MODEL_NAME = "CAMeL-Lab/readability-arabertv2-d3tok-reg"
NUM_LABELS = 1
TARGET_CLASSES = 19
NUM_FEATURES = 7

# --- Path to your specific checkpoint ---
# This is the full path to the checkpoint folder you want to use for predictions.
# It is hard-coded rather than discovered, so update it after retraining.
CHECKPOINT_PATH = "/content/drive/MyDrive/BAREC_Competition/results/hybrid_regression_readability-arabertv2-d3tok-reg/checkpoint-48944"

# --- Other Important Paths ---
PROJECT_DRIVE_PATH = '/content/drive/MyDrive/BAREC_Competition'
TEST_PROCESSED_PATH = os.path.join(PROJECT_DRIVE_PATH, "lex", 'test_processed_full.csv')
SUBMISSION_DIR = os.path.join(PROJECT_DRIVE_PATH, "submission")

# --- Submission File Names ---
# NOTE: these rebind SUBMISSION_PATH / ZIPPED_SUBMISSION_PATH defined earlier in the file.
SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "standalone_submission.csv")
ZIPPED_SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "standalone_submission.zip")

# Ensure the submission directory exists
os.makedirs(SUBMISSION_DIR, exist_ok=True)

print(f"✔️ Configuration loaded. Using checkpoint: {CHECKPOINT_PATH}")


# =====================================================================================
# 3. REQUIRED CLASS DEFINITIONS
# These classes MUST be defined so Python knows the structure of your model and data.
# =====================================================================================

class ReadabilityDataset(TorchDataset):
    """
    Dataset yielding tokenized text, numeric features and optional labels.

    Args:
        texts: Indexable collection of texts (each converted with ``str``).
        features: Indexable collection of numeric feature sequences.
        labels: Optional indexable collection of numeric targets.
        tokenizer_obj: Tokenizer exposing ``encode_plus``.
        max_len: Forwarded to the tokenizer as ``max_length``.
    """

    # Encoded fields copied into every sample, in output order.
    _TOKEN_FIELDS = ('input_ids', 'attention_mask')

    def __init__(self, texts, features, labels=None, tokenizer_obj=None, max_len=256):
        self.texts = texts
        self.features = features
        self.labels = labels
        self.tokenizer = tokenizer_obj
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Build the tensor dict for sample ``idx``."""
        content = str(self.texts[idx])
        numeric = torch.tensor(self.features[idx], dtype=torch.float)

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
        )
        encoded = self.tokenizer.encode_plus(content, None, **tokenizer_options)

        sample = {}
        for field in self._TOKEN_FIELDS:
            sample[field] = torch.tensor(encoded[field], dtype=torch.long)
        sample['features'] = numeric

        has_labels = self.labels is not None
        if has_labels:
            sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

class HybridRegressionModel(nn.Module):
    """
    Transformer encoder followed by a two-layer regression head that also
    receives extra numeric features. The layer layout has to match the one
    used when the loaded weights were saved.

    Args:
        model_name: Identifier handed to ``AutoModel.from_pretrained``.
        num_extra_features: Width of the feature vector appended to the embedding.
    """

    def __init__(self, model_name, num_extra_features):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        self._keys_to_ignore_on_save = []

        head_input_width = self.transformer.config.hidden_size + num_extra_features
        hidden_layer = nn.Linear(head_input_width, 256)
        output_layer = nn.Linear(256, NUM_LABELS)
        self.head = nn.Sequential(hidden_layer, nn.ReLU(), nn.Dropout(0.2), output_layer)

    def forward(self, input_ids, attention_mask, features, labels=None):
        """
        Return the head output, or ``(loss, logits)`` when ``labels`` is given.

        The loss is the MSE between the flattened head output and the
        flattened labels.
        """
        backbone_out = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Position-0 embedding joined with the numeric features along dim 1.
        pooled = backbone_out.last_hidden_state[:, 0, :]
        head_input = torch.cat([pooled, features], dim=1)
        logits = self.head(head_input)

        if labels is None:
            return logits

        criterion = nn.MSELoss()
        return (criterion(logits.view(-1), labels.view(-1)), logits)

# =====================================================================================
# 4. PREDICTION LOGIC
# =====================================================================================

def generate_predictions(checkpoint_path):
    """
    Load weights from ``checkpoint_path``, predict on the test set and write
    the CSV and ZIP submission files.

    Args:
        checkpoint_path: Folder expected to contain 'model.safetensors'.

    All errors are caught and printed; nothing is raised to the caller.
    """
    print("\n===== 🏆 STARTING PREDICTION PIPELINE =====\n")

    # Bound up front so the cleanup in `finally` is valid even when an early
    # step fails before these objects are created.
    model = trainer = test_dataset = None
    try:
        # --- 1. Load Tokenizer and Test Data ---
        print("Initializing Tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

        print(f"Loading preprocessed test data from: {TEST_PROCESSED_PATH}")
        test_df = pd.read_csv(TEST_PROCESSED_PATH)
        test_df['features'] = test_df['features'].apply(ast.literal_eval)
        print(f"✔ Successfully loaded {len(test_df)} test records.")

        # --- 2. Load Model Weights ---
        print(f"\nLoading model from checkpoint: {checkpoint_path}")
        model_weights_path = os.path.join(checkpoint_path, "model.safetensors")

        if not os.path.exists(model_weights_path):
            raise FileNotFoundError("Could not find 'model.safetensors' in the specified checkpoint directory.")

        # Initialize the model architecture, then load the saved weights into it.
        # NOTE(review): load_state_dict requires the saved parameter names and
        # shapes to match this class. Other sections of this file describe
        # checkpoint-48944 as using a deeper head — confirm which definition
        # of HybridRegressionModel applies to the checkpoint being loaded.
        model = HybridRegressionModel(MODEL_NAME, num_extra_features=NUM_FEATURES)
        state_dict = load_file(model_weights_path)
        model.load_state_dict(state_dict)
        print("✔ Model weights loaded successfully.")

        # --- 3. Run Prediction ---
        # The Trainer is a convenient way to run predictions in batches;
        # report_to="none" keeps external experiment loggers out of the run.
        prediction_args = TrainingArguments(
            output_dir="./temp_results",
            per_device_eval_batch_size=64,
            report_to="none",
        )
        trainer = Trainer(model=model, args=prediction_args)

        print("\nGenerating predictions on the test set...")
        test_dataset = ReadabilityDataset(test_df['d3tok_text'].tolist(), test_df['features'].tolist(), tokenizer_obj=tokenizer)
        raw_predictions = trainer.predict(test_dataset)
        print("✔ Predictions generated.")

        # --- 4. Process and Save Submission File ---
        # Round to the nearest integer and clip to the valid range [0, TARGET_CLASSES - 1]
        clipped_preds = np.clip(np.round(raw_predictions.predictions.flatten()), 0, TARGET_CLASSES - 1)
        # Add 1 to convert from the 0-based range back to 1-based final labels
        test_df['Prediction'] = (clipped_preds + 1).astype(int)

        # Accept an 'ID' column by renaming it to the 'id' name used in the output.
        submission_df = test_df.rename(columns={'ID': 'id'})[['id', 'Prediction']]

        print(f"\nSaving prediction file to: {SUBMISSION_PATH}")
        submission_df.to_csv(SUBMISSION_PATH, index=False)

        print(f"Compressing into {os.path.basename(ZIPPED_SUBMISSION_PATH)}...")
        with zipfile.ZipFile(ZIPPED_SUBMISSION_PATH, 'w', zipfile.ZIP_DEFLATED) as zipf:
            zipf.write(SUBMISSION_PATH, arcname=os.path.basename(SUBMISSION_PATH))

        print(f"\n--- ✅ SUCCESS! Submission file '{os.path.basename(ZIPPED_SUBMISSION_PATH)}' created in your Drive! ---")

    except FileNotFoundError as e:
        print("❌ ERROR: A required file was not found. Please check your paths.")
        print(e)
    except Exception as e:
        print(f"❌ An unexpected error occurred: {e}")

    finally:
        # Clean up memory; the names are always bound (possibly to None).
        del model, trainer, test_dataset
        gc.collect()
        torch.cuda.empty_cache()


# =====================================================================================
# 5. EXECUTE THE SCRIPT
# =====================================================================================
# Run the standalone prediction against the configured checkpoint when this
# code is executed as the main module.
if __name__ == "__main__":
    generate_predictions(CHECKPOINT_PATH)


# =====================================================================================
# 1. IMPORTS
# =====================================================================================
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import ast
from torch.utils.data import Dataset as TorchDataset
from sklearn.metrics import cohen_kappa_score
from transformers import (
    AutoTokenizer,
    AutoModel,
    TrainingArguments,
    Trainer,
)
import gc

# This import is crucial for loading the model file
from safetensors.torch import load_file


# =====================================================================================
# 2. CONFIGURATION
# =====================================================================================
# Hub identifier used for both the tokenizer and the transformer backbone.
MODEL_NAME = "CAMeL-Lab/readability-arabertv2-d3tok-reg"
NUM_LABELS = 1
# Rounded predictions are clipped to [0, TARGET_CLASSES - 1] when scoring.
TARGET_CLASSES = 19
# Length of the numeric feature vector passed to the model.
NUM_FEATURES = 7
# Hard-coded checkpoint folder to evaluate; update after retraining.
CHECKPOINT_PATH = "/content/drive/MyDrive/BAREC_Competition/results/hybrid_regression_readability-arabertv2-d3tok-reg/checkpoint-48944"
# Labeled dev split used for the evaluation.
EVAL_DATA_PATH = "/content/drive/MyDrive/BAREC_Competition/lex/dev_processed_full.csv"

print(f"✔️ Configuration loaded. Using checkpoint: {CHECKPOINT_PATH}")


# =====================================================================================
# 3. CORRECT MODEL ARCHITECTURE AND HELPERS
# =====================================================================================

class HybridRegressionModel(nn.Module):
    """
    Transformer encoder with a three-layer regression head (including batch
    normalization) that also consumes extra numeric features.

    The original author describes this as the architecture matching
    checkpoint-48944; that cannot be verified from this file alone.

    Args:
        model_name: Identifier handed to ``AutoModel.from_pretrained``.
        num_extra_features: Width of the feature vector appended to the embedding.
    """

    def __init__(self, model_name, num_extra_features):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        head_input_width = self.transformer.config.hidden_size + num_extra_features
        # Layer order fixes the parameter names (head.0, head.1, ...), so keep it.
        head_layers = [
            nn.Linear(head_input_width, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 1),
        ]
        self.head = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, features, labels=None):
        """
        Return the head output with its last dimension squeezed, or
        ``(loss, logits)`` with an MSE loss when ``labels`` is given.
        """
        encoded = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        first_token = encoded.last_hidden_state[:, 0, :]
        head_input = torch.cat([first_token, features], dim=1)
        logits = self.head(head_input).squeeze(-1)

        if labels is None:
            return logits

        criterion = nn.MSELoss()
        return (criterion(logits, labels.float()), logits)

class ReadabilityDataset(TorchDataset):
    """
    Dataset that formats texts, numeric features and optional labels as the
    keyword tensors expected by the model's forward pass.

    Args:
        texts: Indexable collection of texts (each converted with ``str``).
        features: Indexable collection of numeric feature sequences.
        labels: Optional indexable collection of numeric targets.
        tokenizer_obj: Tokenizer exposing ``encode_plus``.
        max_len: Forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, features, labels=None, tokenizer_obj=None, max_len=256):
        self.texts, self.features, self.labels = texts, features, labels
        self.tokenizer, self.max_len = tokenizer_obj, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tensor dict for sample ``idx``."""
        raw_text = str(self.texts[idx])
        numeric = torch.tensor(self.features[idx], dtype=torch.float)
        encoding = self.tokenizer.encode_plus(
            raw_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
        )
        ids = torch.tensor(encoding['input_ids'], dtype=torch.long)
        mask = torch.tensor(encoding['attention_mask'], dtype=torch.long)
        # The key 'features' has to match the model's forward() argument name.
        sample = dict(input_ids=ids, attention_mask=mask, features=numeric)
        if self.labels is None:
            return sample
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

def compute_metrics(p):
    """
    Compute QWK and MSE for evaluation.

    Args:
        p: Object exposing ``predictions`` (flattened here) and ``label_ids``.

    Returns:
        Dict with 'qwk' (quadratic-weighted kappa between the labels and the
        rounded, clipped predictions) and 'mse' (on the raw predictions).
    """
    continuous = p.predictions.flatten()
    targets = p.label_ids

    # Nearest level, bounded to [0, TARGET_CLASSES - 1].
    levels = np.clip(np.round(continuous), 0, TARGET_CLASSES - 1)
    kappa = cohen_kappa_score(targets, levels, weights='quadratic')

    squared_error = (continuous - targets) ** 2
    return {"qwk": kappa, "mse": squared_error.mean()}


# =====================================================================================
# 4. STANDALONE EVALUATION LOGIC
# =====================================================================================

def evaluate_checkpoint(checkpoint_path):
    """
    Evaluate the weights stored in ``checkpoint_path`` on the dev split and
    print QWK and MSE.

    Args:
        checkpoint_path: Folder expected to contain 'model.safetensors'.

    All errors are caught and printed; nothing is raised to the caller.
    """
    print("\n===== 🚀 STARTING EVALUATION PIPELINE =====\n")
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

        eval_df = pd.read_csv(EVAL_DATA_PATH)
        eval_df['features'] = eval_df['features'].apply(ast.literal_eval)
        # Same label transform as in training: shift down by one, store as float.
        eval_df['label'] = (eval_df['label'].astype(int) - 1).astype(float)
        print(f"✔ Successfully loaded {len(eval_df)} evaluation records.")

        # Load the weights from the checkpoint's model.safetensors file
        model_weights_path = os.path.join(checkpoint_path, "model.safetensors")
        if not os.path.exists(model_weights_path):
            raise FileNotFoundError(f"'model.safetensors' not found in {checkpoint_path}")

        model = HybridRegressionModel(MODEL_NAME, num_extra_features=NUM_FEATURES)
        state_dict = load_file(model_weights_path)
        model.load_state_dict(state_dict)
        print("✔ Model weights loaded successfully.")

        # report_to="none" keeps external experiment loggers out of the
        # evaluation run, matching the test-set prediction section.
        eval_args = TrainingArguments(
            output_dir="./temp_results",
            per_device_eval_batch_size=64,
            report_to="none",
        )
        trainer = Trainer(
            model=model,
            args=eval_args,
            compute_metrics=compute_metrics,
        )

        print("\nRunning evaluation...")
        eval_dataset = ReadabilityDataset(
            eval_df['d3tok_text'].tolist(),
            eval_df['features'].tolist(),
            eval_df['label'].tolist(),
            tokenizer_obj=tokenizer
        )

        results = trainer.evaluate(eval_dataset)

        # Fail with a clear message instead of a format error on None when a
        # metric is absent from the results.
        missing = [key for key in ('eval_qwk', 'eval_mse') if results.get(key) is None]
        if missing:
            raise KeyError(f"Evaluation results are missing: {', '.join(missing)}")

        print("\n--- ✅ EVALUATION COMPLETE ---")
        print(f"Quadratic Weighted Kappa (QWK): {results['eval_qwk']:.4f}")
        print(f"Mean Squared Error (MSE):       {results['eval_mse']:.4f}")
        print("---------------------------------")

    except Exception as e:
        print(f"❌ An error occurred: {e}")
    finally:
        gc.collect()
        torch.cuda.empty_cache()


# =====================================================================================
# 5. EXECUTE THE SCRIPT
# =====================================================================================
# Runs unconditionally (no __main__ guard): evaluate the configured checkpoint.
evaluate_checkpoint(CHECKPOINT_PATH)

# =====================================================================================
# 1. IMPORTS
# =====================================================================================
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import ast
from torch.utils.data import Dataset as TorchDataset
from transformers import (
    AutoTokenizer,
    AutoModel,
    TrainingArguments,
    Trainer,
)
import gc

# This import is crucial for loading the model file
from safetensors.torch import load_file


# =====================================================================================
# 2. CONFIGURATION
# =====================================================================================
# Hub identifier used for both the tokenizer and the transformer backbone.
MODEL_NAME = "CAMeL-Lab/readability-arabertv2-d3tok-reg"
# Rounded predictions are clipped to [0, TARGET_CLASSES - 1] before the +1 shift.
TARGET_CLASSES = 19
# Length of the numeric feature vector passed to the model.
NUM_FEATURES = 7
# Hard-coded checkpoint folder to predict with; update after retraining.
CHECKPOINT_PATH = "/content/drive/MyDrive/BAREC_Competition/results/hybrid_regression_readability-arabertv2-d3tok-reg/checkpoint-48944"

# Path to the blind test set
TEST_DATA_PATH = "/content/drive/MyDrive/BAREC_Competition/lex/test_processed_full.csv"

# Path for the final output file
PREDICTION_FILE_PATH = "/content/drive/MyDrive/BAREC_Competition/submission/test_predictions.csv"

print(f"✔️ Configuration loaded. Using checkpoint: {CHECKPOINT_PATH}")
print(f"✔️ Predicting on data from: {TEST_DATA_PATH}")


# =====================================================================================
# 3. CORRECT MODEL ARCHITECTURE AND DATASET CLASS
# =====================================================================================

class HybridRegressionModel(nn.Module):
    """
    Transformer encoder plus a three-layer regression head with batch
    normalization; the head also takes extra numeric features as input.

    The original author describes this as the architecture matching
    checkpoint-48944; that cannot be verified from this file alone.

    Args:
        model_name: Identifier handed to ``AutoModel.from_pretrained``.
        num_extra_features: Width of the feature vector appended to the embedding.
    """

    def __init__(self, model_name, num_extra_features):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        combined_width = self.transformer.config.hidden_size + num_extra_features

        # First stage: project, normalize, activate, regularize.
        stage_one = [nn.Linear(combined_width, 512), nn.BatchNorm1d(512), nn.ReLU(), nn.Dropout(0.3)]
        # Second stage: narrow down, activate, regularize.
        stage_two = [nn.Linear(512, 256), nn.ReLU(), nn.Dropout(0.2)]
        # Final scalar output.
        self.head = nn.Sequential(*stage_one, *stage_two, nn.Linear(256, 1))

    def forward(self, input_ids, attention_mask, features, labels=None):
        """
        Return the squeezed head output, or ``(loss, logits)`` with an MSE
        loss when ``labels`` is provided.
        """
        backbone_out = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        pooled = backbone_out.last_hidden_state[:, 0, :]
        logits = self.head(torch.cat([pooled, features], dim=1)).squeeze(-1)

        if labels is None:
            return logits

        loss = nn.MSELoss()(logits, labels.float())
        return (loss, logits)

class ReadabilityDataset(TorchDataset):
    """
    Dataset producing the keyword tensors consumed by the model: token ids,
    attention mask, numeric features and, when available, the label.

    Args:
        texts: Indexable collection of texts (each converted with ``str``).
        features: Indexable collection of numeric feature sequences.
        labels: Optional indexable collection of numeric targets.
        tokenizer_obj: Tokenizer exposing ``encode_plus``.
        max_len: Forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, features, labels=None, tokenizer_obj=None, max_len=256):
        self.texts = texts
        self.features = features
        self.labels = labels
        self.tokenizer = tokenizer_obj
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tensor dict for sample ``idx``."""
        def as_long(values):
            return torch.tensor(values, dtype=torch.long)

        def as_float(values):
            return torch.tensor(values, dtype=torch.float)

        content = str(self.texts[idx])
        numeric = as_float(self.features[idx])
        encoded = self.tokenizer.encode_plus(
            content, None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
        )
        # 'features' is the argument name used by the model's forward pass.
        sample = {
            'input_ids': as_long(encoded['input_ids']),
            'attention_mask': as_long(encoded['attention_mask']),
            'features': numeric,
        }
        if self.labels is not None:
            sample['labels'] = as_float(self.labels[idx])
        return sample


# =====================================================================================
# 4. PREDICTION LOGIC
# =====================================================================================

def predict_on_test_set(checkpoint_path):
    """
    Load the model from ``checkpoint_path``, predict on the blind test set
    and write an 'id'/'label' CSV to PREDICTION_FILE_PATH.

    Args:
        checkpoint_path: Folder expected to contain 'model.safetensors'.

    All errors are caught and printed; nothing is raised to the caller.
    """
    print("\n===== 🚀 STARTING PREDICTION PIPELINE =====\n")
    try:
        # --- 1. Load Tokenizer and Test Data ---
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        test_df = pd.read_csv(TEST_DATA_PATH)
        test_df['features'] = test_df['features'].apply(ast.literal_eval)
        print(f"✔ Successfully loaded {len(test_df)} test records.")

        # --- 2. Load Model ---
        model_weights_path = os.path.join(checkpoint_path, "model.safetensors")
        if not os.path.exists(model_weights_path):
            raise FileNotFoundError(f"'model.safetensors' not found in {checkpoint_path}")

        model = HybridRegressionModel(MODEL_NAME, num_extra_features=NUM_FEATURES)
        state_dict = load_file(model_weights_path)
        model.load_state_dict(state_dict)
        print("✔ Model weights loaded successfully.")

        # --- 3. Run Prediction ---
        # Disable logging to wandb for cleaner prediction output
        training_args = TrainingArguments(
            output_dir="./temp_results",
            per_device_eval_batch_size=64,
            report_to="none"
        )
        trainer = Trainer(model=model, args=training_args)

        print("\nRunning prediction on the test set...")
        # Create dataset without labels for the test set
        test_dataset = ReadabilityDataset(
            test_df['d3tok_text'].tolist(),
            test_df['features'].tolist(),
            labels=None,
            tokenizer_obj=tokenizer
        )

        raw_predictions = trainer.predict(test_dataset)

        # --- 4. Process and Save Predictions ---
        # Get the continuous predictions
        preds = raw_predictions.predictions.flatten()

        # Round to nearest integer, clip to [0, TARGET_CLASSES - 1], and shift to 1-based labels
        final_labels = np.clip(np.round(preds), 0, TARGET_CLASSES - 1).astype(int) + 1

        # The other prediction sections of this file accept an 'ID' column on
        # the same CSV, so fall back to it when 'id' is absent.
        id_column = 'id' if 'id' in test_df.columns else 'ID'
        submission_df = pd.DataFrame({
            'id': test_df[id_column],
            'label': final_labels
        })

        # This section does not otherwise create the output folder, so make
        # sure it exists before writing.
        output_dir = os.path.dirname(PREDICTION_FILE_PATH)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        print(f"\nSaving predictions to: {PREDICTION_FILE_PATH}")
        submission_df.to_csv(PREDICTION_FILE_PATH, index=False)

        print("\n--- ✅ PREDICTION COMPLETE ---")
        print("Output saved successfully to your Google Drive.")
        print("---------------------------------")

    except Exception as e:
        print(f"❌ An error occurred: {e}")
    finally:
        gc.collect()
        torch.cuda.empty_cache()


# =====================================================================================
# 5. EXECUTE THE SCRIPT
# =====================================================================================
# Runs unconditionally (no __main__ guard): predict on the blind test set
# with the configured checkpoint.
predict_on_test_set(CHECKPOINT_PATH)