# -*- coding: utf-8 -*-
"""re-train osman_v3 (1).ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1Wxoc6ptE_3jGdJq2MYXhu7zBBpfBmRaO
"""

# Standard library
import os
import zipfile

# Third-party
import numpy as np
import pandas as pd
import torch
from camel_tools.disambig.bert import BERTUnfactoredDisambiguator
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.utils.dediac import dediac_ar
from sklearn.metrics import cohen_kappa_score
from torch.utils.data import Dataset as TorchDataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

# =====================================================================================
# 1. CONFIGURATION
# =====================================================================================

# --- Model & Environment Configuration ---
# Hugging Face identifier used for both the tokenizer and the model weights.
MODEL_NAME = "CAMeL-Lab/readability-arabertv2-d3tok-reg"
# Passed as `num_labels` when the model is loaded; the single output is later
# rounded and clipped, i.e. it is used as a regression score.
NUM_LABELS = 1
# Number of readability levels; predictions are clipped to [0, TARGET_CLASSES - 1].
TARGET_CLASSES = 19
# Root directory that every data/output path below is derived from.
BASE_DIR = '.'
# --- Directories (all derived from BASE_DIR) ---
DATA_DIR = os.path.join(BASE_DIR, "data")
CHECKPOINT_DIR = os.path.join(
    BASE_DIR,
    "results_dares_v3",
    # Last path component of the model id, e.g. "org/name" -> "name".
    "regression_combined_" + MODEL_NAME.rsplit("/", 1)[-1],
)
SUBMISSION_DIR = os.path.join(BASE_DIR, "submission")

# Output directories are created eagerly so later writes cannot fail on a
# missing folder.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
os.makedirs(SUBMISSION_DIR, exist_ok=True)

# --- File Paths ---
# Raw inputs
BAREC_TRAIN_PATH = os.path.join(DATA_DIR, 'train.csv')
BAREC_DEV_PATH = os.path.join(DATA_DIR, 'dev.csv')
OSMAN_TRAIN_PATH = os.path.join(DATA_DIR, 'dares_train.csv')
OSMAN_DEV_PATH = os.path.join(DATA_DIR, 'dares_dev.csv')
BLIND_TEST_PATH = os.path.join(DATA_DIR, 'blind_test_data.csv')

# Submission outputs
SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_regression_combined_final.csv")
ZIPPED_SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_regression_combined_final.zip")

# Cache of the combined, D3Tok-preprocessed splits
TRAIN_PREPROCESSED_PATH = os.path.join(DATA_DIR, 'train_combined_preprocessed_d3tok.csv')
DEV_PREPROCESSED_PATH = os.path.join(DATA_DIR, 'dev_combined_preprocessed_d3tok.csv')


# =====================================================================================
# 2. D3Tok PREPROCESSING FUNCTION
# =====================================================================================
def preprocess_d3tok(text, disambiguator):
    """Convert a sentence into a space-joined string of D3Tok forms.

    Args:
        text: Raw sentence. Anything that is not a non-blank ``str`` yields "".
        disambiguator: Object exposing ``disambiguate(tokens)``; the script
            passes a ``BERTUnfactoredDisambiguator``.

    Returns:
        One form per input token, joined by single spaces. A token contributes
        its dediacritized ``'d3tok'`` value (with ``_+`` / ``+_`` separators
        turned into spaces) when its first analysis has one, and its surface
        form otherwise.
    """
    # Guard: non-strings and whitespace-only strings produce an empty result.
    if not (isinstance(text, str) and text.strip()):
        return ""

    pieces = []
    for entry in disambiguator.disambiguate(simple_word_tokenize(text)):
        # Only the first analysis is consulted (presumably the top-ranked one);
        # index 1 of that entry is used as the feature mapping.
        if entry.analyses and 'd3tok' in entry.analyses[0][1]:
            segmented = dediac_ar(entry.analyses[0][1]['d3tok'])
            segmented = segmented.replace("_+", " +")
            segmented = segmented.replace("+_", "+ ")
            pieces.append(segmented)
        else:
            # No analysis, or no 'd3tok' key: keep the word as it was.
            pieces.append(entry.word)

    return " ".join(pieces)


# =====================================================================================
# 3.
# DATA LOADING, MAPPING, AND COMBINING
# =====================================================================================
def _read_cached_split(path):
    """Read one cached, already-preprocessed split from ``path``."""
    frame = pd.read_csv(path)
    # preprocess_d3tok() returns "" for blank input; read_csv parses such empty
    # cells as NaN, and a bare astype(str) would turn them into the literal
    # text "nan". Restore them to "" so cached runs match the first run.
    frame['text'] = frame['text'].fillna('').astype(str)
    return frame


def _standardize_osman(df, name):
    """Rename the Osman columns to text/label and parse 'G<n>' labels to int.

    Raises:
        ValueError: if the expected 'Text' / 'Fine-grained' columns are absent.
    """
    text_col = 'Text'
    label_col = 'Fine-grained'
    if text_col not in df.columns or label_col not in df.columns:
        raise ValueError(
            f"Error in Osman {name} data: Columns '{text_col}' or '{label_col}' not found. "
            f"Available columns: {df.columns.tolist()}"
        )
    df = df.rename(columns={text_col: 'text', label_col: 'label'})
    df['label'] = df['label'].str.replace('G', '', regex=False).astype(int)
    print(f" - Loaded and mapped {len(df)} Osman {name} records.")
    return df


def _clean_and_normalize(df):
    """Drop incomplete rows, force text to str, shift labels to 0-based floats."""
    df = df.dropna(subset=['text', 'label'])
    return df.assign(
        text=df['text'].astype(str),
        # Labels arrive 1-based; the model is trained on 0-based float targets.
        label=(df['label'].astype(int) - 1).astype(float),
    )


def load_or_preprocess_data(disambiguator):
    """Return the combined (train_df, val_df) readability data.

    If both cached preprocessed CSVs exist they are loaded directly. Otherwise
    the BAREC and Osman files are read, standardized to ``text``/``label``
    columns, concatenated, cleaned, converted to D3Tok text and written to the
    cache paths.

    Args:
        disambiguator: Passed through to ``preprocess_d3tok`` for every row.

    Returns:
        ``(train_df, val_df)`` on success, or ``(None, None)`` when building
        the data fails (the error is printed, not raised).
    """
    print("--- Loading and Preparing Combined Readability Data ---")

    if os.path.exists(TRAIN_PREPROCESSED_PATH) and os.path.exists(DEV_PREPROCESSED_PATH):
        print(f"✔ Found preprocessed combined files. Loading them from:\n- {TRAIN_PREPROCESSED_PATH}\n- {DEV_PREPROCESSED_PATH}")
        train_df = _read_cached_split(TRAIN_PREPROCESSED_PATH)
        val_df = _read_cached_split(DEV_PREPROCESSED_PATH)
        print(f"✔ Successfully loaded {len(train_df)} training and {len(val_df)} validation records.")
        return train_df, val_df

    print("Preprocessed combined files not found. Starting one-time data integration and preprocessing...")
    try:
        # --- Load BAREC Data (assumed to be comma-separated) ---
        print("\n1. Loading BAREC data...")
        barec_columns = {'Sentence': 'text', 'Readability_Level_19': 'label'}
        barec_train_df = pd.read_csv(BAREC_TRAIN_PATH).rename(columns=barec_columns)
        barec_val_df = pd.read_csv(BAREC_DEV_PATH).rename(columns=barec_columns)
        print(f" - Loaded {len(barec_train_df)} BAREC training records.")
        print(f" - Loaded {len(barec_val_df)} BAREC validation records.")

        # --- Load and Map Osman Data ---
        print("\n2. Loading and mapping Osman data...")
        # The Osman files are tab-separated, not comma-separated.
        osman_train_df = pd.read_csv(OSMAN_TRAIN_PATH, sep='\t')
        osman_dev_df = pd.read_csv(OSMAN_DEV_PATH, sep='\t')
        osman_train_df = _standardize_osman(osman_train_df, "training")
        osman_dev_df = _standardize_osman(osman_dev_df, "validation")

        # --- Combine Datasets ---
        print("\n3. Combining BAREC and Osman datasets...")
        wanted = ['text', 'label']
        train_df = pd.concat([barec_train_df[wanted], osman_train_df[wanted]], ignore_index=True)
        val_df = pd.concat([barec_val_df[wanted], osman_dev_df[wanted]], ignore_index=True)
        print(f" - Combined training data size: {len(train_df)} records.")
        print(f" - Combined validation data size: {len(val_df)} records.")

        # --- Final Cleaning and Label Normalization ---
        train_df = _clean_and_normalize(train_df)
        val_df = _clean_and_normalize(val_df)

        print("\n4. Preprocessing all text to D3Tok format (this will only run once)...")
        train_df['text'] = train_df['text'].apply(lambda x: preprocess_d3tok(x, disambiguator))
        val_df['text'] = val_df['text'].apply(lambda x: preprocess_d3tok(x, disambiguator))
        print("✔ Text preprocessing finished.")

        print("\n5. Saving combined preprocessed data for future use...")
        train_df.to_csv(TRAIN_PREPROCESSED_PATH, index=False)
        val_df.to_csv(DEV_PREPROCESSED_PATH, index=False)
        print(f"** Saved preprocessed files to {TRAIN_PREPROCESSED_PATH} and {DEV_PREPROCESSED_PATH} **")
        return train_df, val_df
    except (FileNotFoundError, ValueError) as e:
        print(f"\n! SCRIPT STOPPED DUE TO AN ERROR: {e}")
        return None, None
    except Exception as e:
        # Deliberate catch-all: the caller checks for None and stops the script.
        print(f"! An unexpected error occurred during initial data processing: {e}")
        return None, None


# =====================================================================================
# 4. INITIALIZATION AND DATASET PREPARATION
# =====================================================================================
print("Initializing BERT Disambiguator for preprocessing...")
bert_disambiguator = BERTUnfactoredDisambiguator.pretrained('msa')

train_df, val_df = load_or_preprocess_data(bert_disambiguator)
if train_df is None:
    print("Stopping script due to data loading failure.")
    # `exit()` is an interactive helper injected by `site`; raise SystemExit
    # directly and report a non-zero status because this is a failure.
    raise SystemExit(1)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


# --- PyTorch Dataset Class ---
class ReadabilityDataset(TorchDataset):
    """Tokenized texts with optional float regression labels.

    All texts are tokenized once in the constructor with the module-level
    ``tokenizer`` (truncated / padded to 256 tokens).
    """

    def __init__(self, texts, labels=None):
        self.encodings = tokenizer(texts, truncation=True, padding="max_length", max_length=256)
        # None for unlabeled data such as the blind test set.
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.encodings.get('input_ids', []))


# --- Metrics Calculation ---
def compute_metrics(p):
    """Return quadratic weighted kappa between rounded predictions and labels.

    Args:
        p: Object with ``predictions`` and ``label_ids`` arrays (as supplied
            by the Trainer during evaluation).

    Returns:
        ``{"qwk": <float>}``.
    """
    preds = p.predictions.flatten()
    # Regression outputs -> nearest level, limited to the valid 0-based range.
    rounded_preds = np.round(preds)
    clipped_preds = np.clip(rounded_preds, 0, TARGET_CLASSES - 1).astype(int)
    labels = p.label_ids.astype(int)
    qwk = cohen_kappa_score(labels, clipped_preds, weights='quadratic')
    return {"qwk": qwk}


# =====================================================================================
# 5.
# MODEL TRAINING
# =====================================================================================
print("\n===== INITIALIZING REGRESSION MODEL AND TRAINER =====\n")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

train_dataset = ReadabilityDataset(train_df['text'].tolist(), train_df['label'].tolist())
val_dataset = ReadabilityDataset(val_df['text'].tolist(), val_df['label'].tolist())

training_args = TrainingArguments(
    output_dir=CHECKPOINT_DIR,
    num_train_epochs=18,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_steps=100,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Keep the checkpoint with the best validation QWK.
    load_best_model_at_end=True,
    metric_for_best_model="qwk",
    greater_is_better=True,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)


# --- Resuming from an earlier run ---
def _find_latest_checkpoint(output_dir):
    """Return the path of the highest-step ``checkpoint-<step>`` directory.

    Returns ``None`` when ``output_dir`` is missing or holds no such directory.
    Entries whose suffix is not a plain number are ignored rather than
    crashing the step comparison.
    """
    if not os.path.isdir(output_dir):
        return None
    best_step, best_path = -1, None
    for entry in os.listdir(output_dir):
        prefix, _, step = entry.rpartition('-')
        candidate = os.path.join(output_dir, entry)
        if prefix != "checkpoint" or not step.isdecimal() or not os.path.isdir(candidate):
            continue
        if int(step) > best_step:
            best_step, best_path = int(step), candidate
    return best_path


# Previously the filter was hard-coded to one specific step number, so any
# other checkpoint was silently ignored and training restarted from scratch.
latest_checkpoint_path = _find_latest_checkpoint(CHECKPOINT_DIR)
if latest_checkpoint_path is not None:
    print(f"Resuming training from checkpoint: {latest_checkpoint_path}")
    trainer.train(resume_from_checkpoint=latest_checkpoint_path)
elif os.path.isdir(CHECKPOINT_DIR):
    # No checkpoints found, start training from scratch
    print("No checkpoint found. Starting training from the beginning...")
    trainer.train()
else:
    # Output directory doesn't even exist, start fresh
    print("No checkpoint directory found. Starting training from the beginning...")
    trainer.train()

print("✔ Training finished.")

# --- Recorded test set results ---
# sentence
# Scores: {'accuracy': 48.8, 'accuracy+-1': 71.3, 'avg_abs_dist': 1.1, 'qwk': 83.9, 'accuracy_7': 62.5, 'accuracy_5': 67.6, 'accuracy_3': 74.3}
# doc
# Scores: {'accuracy': 39.0, 'accuracy+-1': 87.0, 'avg_abs_dist': 0.8, 'qwk': 76.1, 'accuracy_7': 68.0, 'accuracy_5': 68.0, 'accuracy_3': 91.0}

# =====================================================================================
# 6. FINAL PREDICTION AND SUBMISSION
# =====================================================================================
print("\n===== FINAL PREDICTION AND SUBMISSION =====\n")

try:
    test_df = pd.read_csv(BLIND_TEST_PATH)

    # Both columns are needed below; check up front so the message names the
    # column that is actually missing instead of always blaming 'ID'.
    missing_columns = [col for col in ('ID', 'Sentence') if col not in test_df.columns]
    if missing_columns:
        print(
            f"! KEY ERROR: Could not find the expected column(s) {missing_columns} in 'blind_test_data.csv'. "
            f"Please check the file's column names. Available columns: {test_df.columns.tolist()}"
        )
    else:
        test_df.dropna(subset=['Sentence'], inplace=True)

        print("Preprocessing blind test text to D3Tok format...")
        test_df['processed_text'] = test_df['Sentence'].apply(lambda x: preprocess_d3tok(x, bert_disambiguator))

        print("Generating predictions on the test set...")
        test_dataset = ReadabilityDataset(test_df['processed_text'].tolist())
        predictions = trainer.predict(test_dataset)

        # Regression output -> nearest valid 0-based level -> 1-based level.
        raw_preds = predictions.predictions.flatten()
        rounded_preds = np.round(raw_preds)
        clipped_preds = np.clip(rounded_preds, 0, TARGET_CLASSES - 1)
        test_df['Prediction'] = (clipped_preds + 1).astype(int)

        # The submission uses 'Sentence ID' as the name of the test file's 'ID' column.
        submission_df = test_df[['ID', 'Prediction']]
        submission_df = submission_df.rename(columns={'ID': 'Sentence ID'})

        print(f"Saving prediction file to: {SUBMISSION_PATH}")
        submission_df.to_csv(SUBMISSION_PATH, index=False)

        print(f"\nCompressing {os.path.basename(SUBMISSION_PATH)} into {os.path.basename(ZIPPED_SUBMISSION_PATH)}...")
        with zipfile.ZipFile(ZIPPED_SUBMISSION_PATH, 'w', zipfile.ZIP_DEFLATED) as zipf:
            zipf.write(SUBMISSION_PATH, arcname=os.path.basename(SUBMISSION_PATH))
        print(f"✔ Submission file {os.path.basename(ZIPPED_SUBMISSION_PATH)} created successfully.")
except FileNotFoundError:
    print(f"! ERROR: Test file not found. Make sure 'blind_test_data.csv' is in the '{DATA_DIR}' directory.")
except Exception as e:
    # Best-effort final step: report the problem instead of crashing the run.
    print(f"An error occurred during final prediction: {e}")

print("\n--- Script Finished ---")