# Training pipeline for the spam classifier
# ENGT 375 Project - Spring 2026 - ODU
#
# This script loads three email datasets, engineers features, compares three
# models (Random Forest, Logistic Regression, SVM), builds a VotingClassifier
# ensemble, and saves all artifacts needed by the Gradio app.

import json
import warnings
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, precision_recall_curve
from scipy.sparse import hstack, csr_matrix
import joblib

# Import our shared utility functions (preprocess_text, compute_metadata_features)
from utils import preprocess_text, compute_metadata_features, META_FEATURE_NAMES

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
# Seed shared by every randomized step (split, models, sampling).
random_state = 42

# All paths are resolved relative to the folder that contains this script.
project_dir = Path(__file__).parent
data_dir = project_dir.joinpath('data')      # input corpora are read from here
models_dir = project_dir.joinpath('models')  # trained artifacts are written here

# ---------------------------------------------------------------------------
# Helper: collect accuracy, precision, recall, f1 into a dict
# ---------------------------------------------------------------------------
def get_metrics(y_true, y_pred):
    """Summarise predictions as a small dict of rounded scores.

    The values are read from a single ``classification_report`` call:
    overall accuracy plus precision, recall and F1 of the class whose
    report key is ``'1'`` (spam).  Every value is rounded to 4 decimals.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall', 'f1'.
    """
    report = classification_report(y_true, y_pred, output_dict=True)
    spam_row = report['1']

    metrics = {'accuracy': round(report['accuracy'], 4)}
    # (output key, key used inside the classification report)
    for out_key, report_key in (
        ('precision', 'precision'),
        ('recall', 'recall'),
        ('f1', 'f1-score'),
    ):
        metrics[out_key] = round(spam_row[report_key], 4)
    return metrics


# ===================================================================
# 1. DATA LOADING
# ===================================================================
print('=' * 60)
print('STEP 1 — Loading datasets')
print('=' * 60)

# Each corpus found on disk contributes one (text, label) frame to this list.
frames = []

# --- 1a. Enron corpus (~33k real corporate emails, gold-standard benchmark) -
enron_csv = data_dir.joinpath('raw', 'enron', 'enron_spam_data.csv')

if not enron_csv.exists():
    print('WARNING: Enron CSV not found at %s' % enron_csv)
else:
    print('Loading Enron corpus ...')
    df_enron = pd.read_csv(enron_csv).rename(
        columns={'Message': 'text', 'Spam/Ham': 'label'}
    )
    # Normalise whitespace/case so the labels compare equal to 'ham' / 'spam'.
    df_enron['label'] = df_enron['label'].str.strip().str.lower()
    # Keep only rows with a recognised label, then drop rows missing text/label.
    has_known_label = df_enron['label'].isin(['ham', 'spam'])
    df_enron = df_enron.loc[has_known_label, ['text', 'label']].dropna()
    frames.append(df_enron)
    print('  Enron: %d emails' % len(df_enron))

# --- 1b. puyang2025 — 7 research corpora (TREC-05/06/07, CEAS-08, etc.) ----
# Enron sub-corpus excluded to avoid duplicating emails already loaded above.
puyang_parquet = data_dir / 'raw' / 'puyang2025' / 'seven_phishing_emails.parquet'

if puyang_parquet.exists():
    print('Loading puyang2025 seven-phishing-email-datasets ...')
    df_puyang = pd.read_parquet(puyang_parquet)
    # Drop the Enron sub-corpus.  .copy() detaches the filtered rows from the
    # frame they were selected from, so the column assignment below writes to
    # an independent DataFrame rather than to a slice (chained assignment,
    # which pandas flags with SettingWithCopyWarning).
    df_puyang = df_puyang[df_puyang['dataset_name'] != 'Enron'].copy()
    # Integer labels -> the 'ham' / 'spam' strings used by the other corpora.
    # Any value other than 0 or 1 maps to NaN and is removed by dropna().
    df_puyang['label'] = df_puyang['label'].map({0: 'ham', 1: 'spam'})
    df_puyang = df_puyang[['text', 'label']].dropna()
    frames.append(df_puyang)
    print('  puyang2025 (Enron excluded): %d emails' % len(df_puyang))
else:
    print('WARNING: puyang2025 parquet not found at %s' % puyang_parquet)

# --- 1c. zefang phishing dataset — 18k phishing-labeled emails --------------
# Phishing is treated as the positive (spam) class for binary classification.
zefang_parquet = data_dir.joinpath('raw', 'zefang', 'phishing_emails.parquet')

if not zefang_parquet.exists():
    print('WARNING: zefang parquet not found at %s' % zefang_parquet)
else:
    print('Loading zefang phishing dataset ...')
    df_zefang = pd.read_parquet(zefang_parquet)
    # 'phishing' is folded into the positive class; labels outside this map
    # become NaN and are dropped together with rows that have no text.
    zefang_label_map = {'ham': 'ham', 'phishing': 'spam'}
    df_zefang['label'] = df_zefang['label'].map(zefang_label_map)
    df_zefang = df_zefang[['text', 'label']].dropna()
    frames.append(df_zefang)
    print('  zefang phishing: %d emails' % len(df_zefang))

# Nothing was loaded at all -> there is nothing to train on.
if len(frames) == 0:
    raise RuntimeError('No training data found! Check data/raw/ folder.')

# --- 1d. Combine and deduplicate -------------------------------------------
df = pd.concat(frames, ignore_index=True)
before_dedup = len(df)
print('Combined: %d emails' % before_dedup)

# Rows with identical text are collapsed to their first occurrence.
df = df.drop_duplicates(subset=['text'])
df = df.reset_index(drop=True)
print('After dedup: %d emails (removed %d duplicates)' % (len(df), before_dedup - len(df)))

ham_count = df['label'].eq('ham').sum()
spam_count = df['label'].eq('spam').sum()
print('  Ham: %d | Spam: %d' % (ham_count, spam_count))

# Frames may exist but contain no usable rows.
if df.empty:
    raise RuntimeError('No training data found!  Check the data/ folder.')


# ===================================================================
# 2. FEATURE ENGINEERING
# ===================================================================
print()
print('=' * 60)
print('STEP 2 — Feature engineering')
print('=' * 60)

# 2a. Preprocess (stem, remove stopwords, etc.)
# Applied element by element, so it is done once on the full frame.
# NOTE(review): assumes preprocess_text treats each email independently —
# confirm in utils.
print('Preprocessing text ...')
df['clean_text'] = df['text'].apply(preprocess_text)

# 2b. Encode labels: 1 = spam, 0 = ham
y = (df['label'] == 'spam').astype(int)

# 2c. Hold out the test rows BEFORE fitting any transformer.
# Fitting TF-IDF (vocabulary + IDF weights) or the MinMaxScaler (min/max) on
# the full dataset would let test rows influence the features and inflate the
# reported test metrics.  Both are therefore fitted on the training rows only
# and merely applied to the test rows.
print('Splitting train / test before fitting TF-IDF and the scaler ...')
df_train, df_test, y_train, y_test = train_test_split(
    df, y,
    test_size=0.30,
    random_state=random_state,
    stratify=y,
)

# 2d. TF-IDF on the cleaned text (fit on train, transform test)
print('Fitting TF-IDF (max 3000 features, 1-3 grams) ...')
tfidf = TfidfVectorizer(
    max_features=3000,
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.90,
    sublinear_tf=True,
)
X_tfidf_train = tfidf.fit_transform(df_train['clean_text'])
X_tfidf_test = tfidf.transform(df_test['clean_text'])
print('  TF-IDF shape: %s' % str(X_tfidf_train.shape))

# 2e. 24 metadata features (things like exclamation density, caps ratio, ...)
# NOTE(review): assumes compute_metadata_features works per email, so calling
# it separately on each split gives the same rows as one call — confirm in utils.
print('Computing 24 metadata features ...')
X_meta_train_raw = compute_metadata_features(df_train['text'].values)
X_meta_test_raw = compute_metadata_features(df_test['text'].values)

# Scale metadata to 0-1 (on the training rows) so they match the TF-IDF scale.
# Test values outside the training min/max can fall slightly outside [0, 1].
meta_scaler = MinMaxScaler()
X_meta_train = meta_scaler.fit_transform(X_meta_train_raw)
X_meta_test = meta_scaler.transform(X_meta_test_raw)

# 2f. Combine TF-IDF (sparse) + metadata (dense -> sparse) into one matrix per
# split.  Converted to CSR because rows of X_train are indexed later on.
X_train = hstack([X_tfidf_train, csr_matrix(X_meta_train)]).tocsr()
X_test = hstack([X_tfidf_test, csr_matrix(X_meta_test)]).tocsr()
feature_names = list(tfidf.get_feature_names_out()) + META_FEATURE_NAMES
print('Total features: %d (%d TF-IDF + %d metadata)' % (
    len(feature_names), X_tfidf_train.shape[1], len(META_FEATURE_NAMES)
))


# ===================================================================
# 3. MODEL COMPARISON
# ===================================================================
print()
print('=' * 60)
print('STEP 3 — Train / test split + model comparison')
print('=' * 60)

print('Train: %d | Test: %d' % (X_train.shape[0], X_test.shape[0]))

# Test-set metrics of every model, keyed by model name.
all_metrics = {}
class_names = ['Ham', 'Spam']

# --- Random Forest ----------------------------------------------------------
print('\nTraining Random Forest ...')
rf = RandomForestClassifier(
    random_state=random_state,
    class_weight='balanced',
    n_estimators=200,
    n_jobs=-1,
)
rf_pred = rf.fit(X_train, y_train).predict(X_test)
rf_summary = classification_report(y_test, rf_pred, target_names=class_names)
print(rf_summary)
all_metrics['RandomForest'] = get_metrics(y_test, rf_pred)

# --- Logistic Regression ----------------------------------------------------
print('Training Logistic Regression ...')
lr = LogisticRegression(
    random_state=random_state,
    class_weight='balanced',
    max_iter=1000,
)
lr_pred = lr.fit(X_train, y_train).predict(X_test)
lr_summary = classification_report(y_test, lr_pred, target_names=class_names)
print(lr_summary)
all_metrics['LogisticRegression'] = get_metrics(y_test, lr_pred)

# --- SVM (linear kernel) ----------------------------------------------------
# LinearSVC is used instead of SVC(probability=True) because SVC is O(n^2)
# and would take hours on 100K emails.  LinearSVC has no predict_proba(), so
# it is wrapped in CalibratedClassifierCV, which adds it; the soft-voting
# ensemble needs probabilities.
print('Training SVM (LinearSVC + calibration for probabilities) ...')
base_svm = LinearSVC(
    random_state=random_state,
    class_weight='balanced',
    max_iter=2000,
)
svm = CalibratedClassifierCV(base_svm, cv=5)
svm_pred = svm.fit(X_train, y_train).predict(X_test)
svm_summary = classification_report(y_test, svm_pred, target_names=class_names)
print(svm_summary)
all_metrics['SVM'] = get_metrics(y_test, svm_pred)


# ===================================================================
# 4. ENSEMBLE (VotingClassifier)
# ===================================================================
print()
print('=' * 60)
print('STEP 4 — Voting ensemble (soft voting)')
print('=' * 60)

# VotingClassifier gets new, unfitted estimators with the same
# hyper-parameters as the individually trained models.
ensemble_rf = RandomForestClassifier(
    n_estimators=200,
    n_jobs=-1,
    class_weight='balanced',
    random_state=random_state,
)
ensemble_lr = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    random_state=random_state,
)
ensemble_svm = CalibratedClassifierCV(
    LinearSVC(class_weight='balanced', max_iter=2000, random_state=random_state),
    cv=5,
)

ensemble = VotingClassifier(
    estimators=[
        ('rf', ensemble_rf),
        ('lr', ensemble_lr),
        ('svm', ensemble_svm),
    ],
    voting='soft',
    n_jobs=-1,
)
print('Fitting ensemble (all three models from scratch) ...')
ensemble.fit(X_train, y_train)

# Evaluate the ensemble on the held-out rows, like the individual models.
ens_pred = ensemble.predict(X_test)
ens_summary = classification_report(y_test, ens_pred, target_names=['Ham', 'Spam'])
print(ens_summary)
all_metrics['VotingEnsemble'] = get_metrics(y_test, ens_pred)

# --- Optimal threshold via precision-recall curve ---
# Goal: the largest P(spam) cut-off at which ham precision is still >= 99%,
# i.e. when the model says "ham", it's right at least 99% of the time.
# This string is printed as-is (no %-formatting), so a single '%' is correct;
# '%%' would be shown verbatim.
print('Computing optimal threshold (target: 99% ham precision) ...')
y_test_proba = ensemble.predict_proba(X_test)[:, 1]  # P(spam)
# Only the candidate thresholds are used; the spam precision/recall arrays
# returned alongside them are not needed here.
_, _, thresholds_pr = precision_recall_curve(y_test, y_test_proba)

# Loop-invariant: which test emails are truly ham.
is_ham = np.asarray(y_test) == 0

optimal_threshold = 0.50  # fallback when no candidate reaches the target
# Scan from the highest candidate down and stop at the first one that meets
# the target, i.e. the largest qualifying threshold.
for t in sorted(thresholds_pr, reverse=True):
    # Emails with P(spam) < t are classified as ham
    predicted_ham_mask = y_test_proba < t
    n_predicted_ham = predicted_ham_mask.sum()
    if n_predicted_ham == 0:
        continue
    # Of those predicted-ham emails, how many are actually ham?
    ham_precision = is_ham[predicted_ham_mask].sum() / n_predicted_ham
    if ham_precision >= 0.99:
        # Store a plain Python float rather than a NumPy scalar so the value
        # serialises the same way in joblib and JSON.
        optimal_threshold = float(t)
        break

print('Optimal threshold: %.4f' % optimal_threshold)


# ===================================================================
# 5. SAVE ARTIFACTS
# ===================================================================
print()
print('=' * 60)
print('STEP 5 — Saving artifacts to models/')
print('=' * 60)

models_dir.mkdir(exist_ok=True)

# 5a-5e. One joblib file per artifact: (object, file name, confirmation line).
#   voting model, TF-IDF vectorizer, metadata scaler,
#   feature names (TF-IDF names + META_FEATURE_NAMES), optimal threshold.
joblib_artifacts = [
    (ensemble, 'voting_model.joblib',
     '  Saved voting_model.joblib'),
    (tfidf, 'tfidf_vectorizer.joblib',
     '  Saved tfidf_vectorizer.joblib'),
    (meta_scaler, 'meta_scaler.joblib',
     '  Saved meta_scaler.joblib'),
    (feature_names, 'feature_names.joblib',
     '  Saved feature_names.joblib (%d names)' % len(feature_names)),
    (optimal_threshold, 'optimal_threshold.joblib',
     '  Saved optimal_threshold.joblib (%.4f)' % optimal_threshold),
]
for artifact, file_name, saved_message in joblib_artifacts:
    joblib.dump(artifact, models_dir / file_name)
    print(saved_message)

# 5f. Training sample (200 dense rows for LIME / SHAP explanations)
# At most 200 distinct training rows, drawn with a seeded generator so the
# same rows are picked on every run.
n_train_rows = X_train.shape[0]
sample_size = n_train_rows if n_train_rows < 200 else 200
rng = np.random.RandomState(random_state)
sample_idx = rng.choice(n_train_rows, size=sample_size, replace=False)
sampled_rows = X_train[sample_idx]
training_sample = sampled_rows.toarray()  # convert sparse -> dense
joblib.dump(training_sample, models_dir / 'training_sample.joblib')
print('  Saved training_sample.joblib  shape=%s' % str(training_sample.shape))

# 5g. Training report (JSON with metrics for every model)
# Keys are inserted in the order they should appear in the JSON file.
report = {}
report['random_state'] = random_state
report['train_size'] = int(X_train.shape[0])
report['test_size'] = int(X_test.shape[0])
report['total_features'] = len(feature_names)
report['optimal_threshold'] = round(optimal_threshold, 4)
report['models'] = all_metrics

report_path = models_dir.joinpath('training_report.json')
with open(report_path, 'w') as f:
    json.dump(report, f, indent=2)
print('  Saved training_report.json')

print()
print('=' * 60)
print('DONE — all 7 artifacts saved to models/')
print('=' * 60)