# Retrain the spam classifier model
# ENGT 375 Project - Spring 2026 - ODU
# Loads the Enron spam corpus, the puyang2025 seven-corpus parquet, the zefang
# phishing parquet, and (if present) the user feedback log

import re
import warnings
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
# I found CalibratedClassifierCV in the sklearn docs - it makes the probability
# predictions more accurate instead of just using the raw RF outputs
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, f1_score, precision_recall_curve
# MinMaxScaler normalizes metadata features to 0-1 range so they're on the same
# scale as the TF-IDF features (learned about scaling importance from Module 7A kNN)
from sklearn.preprocessing import MinMaxScaler
# hstack and csr_matrix let me combine the TF-IDF sparse matrix with the metadata
# features without converting everything to a dense array (saves a lot of memory)
from scipy.sparse import hstack, csr_matrix
import joblib
import json
import hashlib
import requests
from tqdm import tqdm
from utils_student import (preprocess_text, compute_metadata_features,
                           spam_context_phrases, ham_context_phrases,
                           registration_phrases, url_shorteners,
                           legitimate_platforms, OLLAMA_API, LLM_FEATURE_NAMES)

# Hide the two noisiest warning categories so the training log stays readable.
for _category in (FutureWarning, DeprecationWarning):
    warnings.filterwarnings('ignore', category=_category)

# Folder layout, all resolved relative to the directory holding this script.
project_dir = Path(__file__).parent
raw_dir = project_dir / 'data' / 'raw'
data_dir = project_dir / 'data' / 'processed'
models_dir = project_dir / 'models'

# Single seed shared by the split, the forest and the LIME sample.
random_state = 42

# Raw dataset files (each one is optional; the loader checks for existence).
enron_csv = raw_dir / 'enron' / 'enron_spam_data.csv'
puyang_parquet = raw_dir / 'puyang2025' / 'seven_phishing_emails.parquet'
zefang_parquet = raw_dir / 'zefang' / 'phishing_emails.parquet'

SKIP_LLM_TRAINING = True  # LLM features too slow for large datasets

def check_ollama_available(model='qwen3.5:2b'):
    """Report whether a local Ollama server is up and lists ``model``.

    Queries ``http://localhost:11434/api/tags`` with a 2 second timeout and
    returns True when any listed model name contains ``model`` as a
    substring. Any failure (server down, timeout, non-200 status, unexpected
    payload) yields False, so the caller can simply skip the LLM features.
    """
    try:
        response = requests.get('http://localhost:11434/api/tags', timeout=2)
        if response.status_code != 200:
            return False
        installed = [entry['name'] for entry in response.json().get('models', [])]
        for name in installed:
            if model in name:
                return True
        return False
    except Exception:
        # Best-effort probe: every error means "not available".
        return False


def extract_llm_features_single(text, model='qwen3.5:2b'):
    """Extract intent and tone features for a single email via Ollama.

    Only the first 500 characters of ``text`` are sent. The model is asked
    for a flat JSON object with six scores; the scores are returned in the
    fixed order promotional, transactional, personal, phishing, urgency,
    formality.

    Args:
        text: Email body (string).
        model: Ollama model name to query.

    Returns:
        A list of six floats, each in [0.0, 1.0]. If the request fails, the
        status is not 200, or no JSON object can be parsed, all six values
        are the neutral 0.5. If the reply parses but an individual field is
        missing, non-numeric or NaN, only that field falls back to 0.5;
        out-of-range numbers are clamped into [0.0, 1.0].
    """
    neutral = 0.5
    score_keys = ('promotional', 'transactional', 'personal',
                  'phishing', 'urgency', 'formality')
    fallback = [neutral] * len(score_keys)

    truncated = text[:500]
    prompt = (
        'Rate this email on these dimensions (0.0 to 1.0).\n'
        'Respond with ONLY valid JSON: {"promotional": X, "transactional": X, '
        '"personal": X, "phishing": X, "urgency": X, "formality": X}\n'
        '/no_think\n\n'
        'Email: "%s"' % truncated
    )
    try:
        resp = requests.post(OLLAMA_API, json={
            'model': model,
            'messages': [{'role': 'user', 'content': prompt}],
            'stream': False,
            'think': False,
            'options': {'temperature': 0.1, 'num_predict': 100}
        }, timeout=30)
        if resp.status_code != 200:
            return fallback
        content = resp.json().get('message', {}).get('content', '')
        # Drop any reasoning block the model emitted despite /no_think.
        content = re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL).strip()
        json_match = re.search(r'\{[^}]+\}', content)
        if not json_match:
            return fallback
        data = json.loads(json_match.group())
    except Exception:
        # Deliberate best-effort: a failed LLM call must not abort training.
        return fallback

    scores = []
    for key in score_keys:
        # Parse each field on its own so one bad value (null, "high", ...)
        # does not throw away the other five scores.
        try:
            value = float(data.get(key, neutral))
        except (TypeError, ValueError):
            value = neutral
        if value != value:  # NaN is the only float that is not equal to itself
            value = neutral
        # The prompt asks for 0.0-1.0 but the model is not obliged to comply.
        scores.append(min(1.0, max(0.0, value)))
    return scores


# The LLM calls are slow, so every email is keyed by a hash of its text;
# anything already processed can then be looked up instead of recomputed.
def get_text_hash(text):
    """Return a short, deterministic cache key for an email body.

    The key is the first 16 hex characters of the SHA-256 digest of the
    UTF-8 encoded text (unencodable characters are replaced, not rejected).
    """
    encoded = text.encode('utf-8', errors='replace')
    digest = hashlib.sha256(encoded).hexdigest()
    return digest[:16]


# On-disk cache of LLM features: one CSV row per text hash.
CACHE_PATH = data_dir / 'llm_features_cache.csv'


def get_or_compute_llm_features(texts, model='qwen3.5:2b'):
    """Load cached LLM features, compute missing ones, return combined array.

    Args:
        texts: Sequence of email bodies (strings).
        model: Ollama model name passed through to the per-email extractor.

    Returns:
        A float array with one row per entry of ``texts`` (same order) and
        one column per name in ``LLM_FEATURE_NAMES``. For empty input the
        shape is ``(0, len(LLM_FEATURE_NAMES))``.

    Side effects:
        Reads ``CACHE_PATH`` if it exists and rewrites it (creating the
        parent folder when needed) whenever new features were computed.

    Note:
        Whatever the extractor returns is cached, including its neutral
        fallback vector for failed calls; delete the cache file to retry.
    """
    hashes = [get_text_hash(t) for t in texts]
    feature_cols = list(LLM_FEATURE_NAMES)

    # Read the cache once. The hash column is forced to str so an all-digit
    # hash is not parsed as a number, and duplicate hashes are collapsed so
    # every key maps to exactly one feature vector.
    existing = None
    cached = {}
    if CACHE_PATH.exists():
        existing = pd.read_csv(CACHE_PATH, dtype={'hash': str}).set_index('hash')
        existing = existing[~existing.index.duplicated(keep='last')]
        cached = dict(zip(existing.index, existing[feature_cols].values.tolist()))
        print('  Loaded %d cached LLM features' % len(cached))

    # Collect each missing text once, even if it appears several times.
    pending = {}
    for h, text in zip(hashes, texts):
        if h not in cached and h not in pending:
            pending[h] = text
    print('  Need to compute %d new LLM features' % len(pending))

    if pending:
        new_entries = []
        for h, text in tqdm(pending.items(), desc='  Extracting LLM features'):
            feats = extract_llm_features_single(text, model=model)
            cached[h] = feats
            new_entries.append({'hash': h, **dict(zip(feature_cols, feats))})

        # Merge with what was already on disk and write the cache back.
        new_df = pd.DataFrame(new_entries).set_index('hash')
        combined = new_df if existing is None else pd.concat([existing, new_df])
        combined = combined[~combined.index.duplicated(keep='last')]
        # Make sure the folder exists so slow LLM work is not lost on write.
        CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
        combined.to_csv(CACHE_PATH)
        print('  Saved %d new features to cache' % len(new_entries))

    # Build output array in input order; keep it 2-D even when empty.
    if not hashes:
        return np.empty((0, len(feature_cols)))
    return np.array([cached[h] for h in hashes])


print('Starting model training...')

# Every corpus found on disk is reduced to the two columns 'text' and
# 'label' (values 'ham' / 'spam') and collected here; they are concatenated
# once at the end instead of growing a DataFrame step by step.
corpus_frames = []

# Enron corpus — gold standard real corporate email (~33k)
if enron_csv.exists():
    print('Loading Enron email dataset...')
    df_enron = pd.read_csv(enron_csv)
    df_enron = df_enron.rename(columns={'Message': 'text', 'Spam/Ham': 'label'})
    df_enron['label'] = df_enron['label'].str.strip().str.lower()
    df_enron = df_enron[['text', 'label']].dropna(subset=['text', 'label'])
    df_enron = df_enron[df_enron['label'].isin(['spam', 'ham'])]
    print('  Enron: %d emails (%d ham, %d spam)' % (
        len(df_enron),
        (df_enron['label'] == 'ham').sum(),
        (df_enron['label'] == 'spam').sum()
    ))
    corpus_frames.append(df_enron)
else:
    print('WARNING: Enron CSV not found at %s' % str(enron_csv))

# puyang2025 — 7 research corpora (TREC-05/06/07, CEAS-08, SpamAssassin, Ling-Spam)
# Enron sub-corpus excluded to avoid duplicating emails already loaded above.
if puyang_parquet.exists():
    print('Loading puyang2025 seven-phishing-email-datasets...')
    df_puyang = pd.read_parquet(puyang_parquet)
    # .copy() so the label assignment below writes to an independent frame
    # rather than to a filtered view of the parquet data.
    df_puyang = df_puyang[df_puyang['dataset_name'] != 'Enron'].copy()
    # Labels that are neither 0 nor 1 become NaN here and are dropped below.
    df_puyang['label'] = df_puyang['label'].map({0: 'ham', 1: 'spam'})
    df_puyang = df_puyang[['text', 'label']].dropna(subset=['text', 'label'])
    print('  puyang2025 (Enron excluded): %d emails (%d ham, %d spam)' % (
        len(df_puyang),
        (df_puyang['label'] == 'ham').sum(),
        (df_puyang['label'] == 'spam').sum()
    ))
    corpus_frames.append(df_puyang)
else:
    print('WARNING: puyang2025 parquet not found at %s' % str(puyang_parquet))

# zefang phishing dataset — 18k emails labeled ham vs phishing
# Phishing is treated as the positive (spam) class for binary classification.
if zefang_parquet.exists():
    print('Loading zefang phishing dataset...')
    df_zefang = pd.read_parquet(zefang_parquet)
    # Any label other than 'ham' / 'phishing' becomes NaN and is dropped below.
    df_zefang['label'] = df_zefang['label'].map({'ham': 'ham', 'phishing': 'spam'})
    df_zefang = df_zefang[['text', 'label']].dropna(subset=['text', 'label'])
    print('  zefang phishing: %d emails (%d ham, %d phishing->spam)' % (
        len(df_zefang),
        (df_zefang['label'] == 'ham').sum(),
        (df_zefang['label'] == 'spam').sum()
    ))
    corpus_frames.append(df_zefang)
else:
    print('WARNING: zefang parquet not found at %s' % str(zefang_parquet))

if corpus_frames:
    df = pd.concat(corpus_frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=['text', 'label'])

if len(df) == 0:
    raise RuntimeError('No training data found. Run the data download step first.')

print('Combined dataset: %d emails' % len(df))

# Load user feedback corrections (if any)
#
# Corrections are weighted through a per-row sample weight instead of by
# repeating rows: repeated rows are removed again by the text-level
# deduplication below, so they never reached the model. A correction also
# replaces any corpus row with the same text, so the corrected label wins.
feedback_weight = 5.0

feedback_file = project_dir / 'data' / 'feedback' / 'feedback_log.csv'
df_feedback_clean = None
if feedback_file.exists():
    print('Loading user feedback corrections...')
    df_feedback = pd.read_csv(feedback_file)
    if 'email_text' in df_feedback.columns and 'correct_label' in df_feedback.columns:
        df_feedback_clean = df_feedback[['email_text', 'correct_label']].dropna()
        df_feedback_clean = df_feedback_clean.rename(
            columns={'email_text': 'text', 'correct_label': 'label'})
        n_logged = len(df_feedback_clean)
        # Normalise case/whitespace so 'Spam' or ' ham ' are not misread.
        # Assumes the app logs the words 'spam' / 'ham' — TODO confirm; any
        # other value is skipped (and counted in the message below) rather
        # than being silently trained as ham.
        normalized = df_feedback_clean['label'].astype(str).str.strip().str.lower()
        df_feedback_clean = df_feedback_clean.assign(label=normalized)
        df_feedback_clean = df_feedback_clean[df_feedback_clean['label'].isin(['spam', 'ham'])]
        # If the same email was corrected more than once, the last entry wins.
        df_feedback_clean = df_feedback_clean.drop_duplicates(subset=['text'], keep='last')
        df_feedback_clean = df_feedback_clean.assign(sample_weight=feedback_weight)
        print('  Feedback: %d usable corrections out of %d logged rows (sample weight %.1fx)' % (
            len(df_feedback_clean), n_logged, feedback_weight))

# Deduplicate
before = len(df) + (0 if df_feedback_clean is None else len(df_feedback_clean))
df = df.drop_duplicates(subset=['text']).assign(sample_weight=1.0)
if df_feedback_clean is not None and len(df_feedback_clean) > 0:
    # Drop corpus rows that a correction supersedes, then add the corrections.
    df = df[~df['text'].isin(df_feedback_clean['text'])]
    df = pd.concat([df, df_feedback_clean], ignore_index=True)
df = df.reset_index(drop=True)
print('Total after dedup: %d emails (removed %d duplicates)' % (len(df), before - len(df)))

print('Preprocessing text...')
df['clean_text'] = df['text'].apply(preprocess_text)

# Build expanded TF-IDF features + metadata features + optional LLM features
print('Building features (3000 TF-IDF + 24 metadata + optional LLM)...')
tfidf = TfidfVectorizer(
    max_features=3000,
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.90,
    sublinear_tf=True
)
X_tfidf = tfidf.fit_transform(df['clean_text'])

X_meta = compute_metadata_features(df['text'].values)

# Normalize metadata features to 0-1 range so they match the TF-IDF scale
# Without this, features like email_length (could be 1000+) would dominate
# over binary features like has_unsubscribe (just 0 or 1)
meta_scaler = MinMaxScaler()
X_meta_scaled = meta_scaler.fit_transform(X_meta)

# Display names for the metadata columns; presumably in the same order as
# the columns returned by compute_metadata_features — verify against utils.
meta_feature_names = ['exclamation_density', 'dollar_sign_count', 'caps_word_ratio',
                      'spam_phrase_count', 'ham_phrase_count', 'net_spam_context',
                      'url_count', 'html_tag_count', 'email_length',
                      'avg_sentence_length', 'capitalization_ratio',
                      'has_specific_date', 'has_specific_time', 'date_reference_count',
                      'has_unsubscribe', 'has_physical_address', 'has_proper_greeting',
                      'has_contact_info', 'registration_language_score',
                      'cta_to_info_ratio', 'shortener_url_ratio',
                      'legitimate_platform_count', 'gov_edu_url_count',
                      'question_mark_count']

# Check if Ollama is available for LLM feature extraction
# SKIP_LLM_TRAINING flag skips the slow LLM calls during training since
# TF-IDF + metadata features are already enough for good accuracy
if SKIP_LLM_TRAINING:
    print('SKIP_LLM_TRAINING=True - skipping LLM features for faster training')
    ollama_available = False
else:
    ollama_available = check_ollama_available()
if ollama_available:
    print('Ollama available - extracting LLM intent/tone features...')
    X_llm = get_or_compute_llm_features(df['text'].values)
    # Combine all feature matrices - I use hstack from scipy.sparse because
    # TF-IDF is already sparse and converting to dense would use too much memory
    X_combined = hstack([X_tfidf, csr_matrix(X_meta_scaled), csr_matrix(X_llm)])
    feature_names = list(tfidf.get_feature_names_out()) + meta_feature_names + LLM_FEATURE_NAMES
else:
    print('Ollama not available - skipping LLM features')
    X_combined = hstack([X_tfidf, csr_matrix(X_meta_scaled)])
    feature_names = list(tfidf.get_feature_names_out()) + meta_feature_names
y = (df['label'] == 'spam').astype(int)
# Row weights aligned with X_combined / y: 1.0 for corpus emails and
# feedback_weight for user corrections.
sample_weight = df['sample_weight'].to_numpy(dtype=float)

n_llm = len(LLM_FEATURE_NAMES) if ollama_available else 0
print('Total features: %d (%d TF-IDF + %d metadata + %d LLM)' % (len(feature_names), X_tfidf.shape[1], len(meta_feature_names), n_llm))

# Split into train/test (the weights are split with the same row indices)
X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
    X_combined, y, sample_weight,
    test_size=0.3, random_state=random_state, stratify=y
)

print('Running GridSearchCV...')
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [20, None],
}
# Use class_weight balanced to handle class imbalance
# Reduced grid from 8 combos to 4, and CV from 5 to 3 folds for faster training
rf = RandomForestClassifier(random_state=random_state, n_jobs=-1, class_weight='balanced')
grid_search = GridSearchCV(rf, param_grid, cv=3, scoring='f1', n_jobs=-1, verbose=1)
# sample_weight is forwarded to each forest fit and sliced per CV fold.
grid_search.fit(X_train, y_train, sample_weight=w_train)

best_rf = grid_search.best_estimator_
print('Best params: %s' % str(grid_search.best_params_))
print('Best CV F1: %.4f' % grid_search.best_score_)

# Calibrate probabilities so the confidence percentages are more meaningful
# I found this technique on the sklearn docs - without it the RF probabilities
# can be overconfident, which makes the threshold slider in the app less useful
print('Calibrating probabilities with isotonic regression...')
calibrated_rf = CalibratedClassifierCV(best_rf, method='isotonic', cv=5)
calibrated_rf.fit(X_train, y_train, sample_weight=w_train)

# Pick the decision threshold that targets 99% ham precision.
# An email is called ham when its spam probability is below the threshold,
# so a higher threshold labels more emails as ham. The candidates are
# scanned from highest to lowest and the first one whose ham predictions
# are at least 99% correct is kept; 0.50 is used if none qualifies.
# NOTE: the threshold is chosen on the same test split that is reported
# below, so the "optimal threshold" report is an optimistic estimate.
y_test_proba = calibrated_rf.predict_proba(X_test)[:, 1]
precision, recall, thresholds_pr = precision_recall_curve(y_test, y_test_proba)

true_labels = y_test.values
best_threshold = 0.50
for candidate in sorted(thresholds_pr, reverse=True):
    called_ham = y_test_proba < candidate
    n_called_ham = called_ham.sum()
    if n_called_ham == 0:
        # Nothing falls below this candidate, so precision is undefined.
        continue
    n_truly_ham = (true_labels[called_ham] == 0).sum()
    if n_truly_ham / n_called_ham >= 0.99:
        best_threshold = candidate
        break
optimal_threshold = best_threshold

print('Optimal threshold (99%% ham precision): %.4f' % optimal_threshold)

# Report both operating points: the model's default 0.5 cut and the tuned one.
y_pred = calibrated_rf.predict(X_test)
print('\nTest Set Performance (default 0.5 threshold):')
print(classification_report(y_test, y_pred, target_names=['Ham', 'Spam']))

y_pred_optimal = (y_test_proba >= optimal_threshold).astype(int)
print('Test Set Performance (optimal %.2f threshold):' % optimal_threshold)
print(classification_report(y_test, y_pred_optimal, target_names=['Ham', 'Spam']))

# Save model artifacts
print('Saving model files...')
models_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(calibrated_rf, models_dir / 'random_forest_spam.joblib')
joblib.dump(best_rf, models_dir / 'random_forest_raw.joblib')
joblib.dump(tfidf, models_dir / 'tfidf_vectorizer.joblib')
joblib.dump(feature_names, models_dir / 'feature_names.joblib')
joblib.dump(optimal_threshold, models_dir / 'optimal_threshold.joblib')
joblib.dump(meta_scaler, models_dir / 'meta_scaler.joblib')

# Save training sample for LIME (200 random rows from training set)
# Pick the row indices first and densify only those rows; converting the
# whole sparse training matrix to dense just to keep 200 rows can need
# several GB of memory. The seed and the draw are unchanged, so the saved
# sample is the same as before.
n_train_rows = X_train.shape[0]
rng = np.random.RandomState(random_state)
sample_idx = rng.choice(n_train_rows, size=min(200, n_train_rows), replace=False)
training_sample = X_train.tocsr()[sample_idx].toarray()
joblib.dump(training_sample, models_dir / 'training_sample.joblib')

# Save training config (tracks whether LLM features were used)
# 'model_used' names the default model of the Ollama helpers in this script
# ('qwen3.5:2b'); the previously recorded 'gemma3:1b' is not used anywhere
# in this file.
joblib.dump({
    'llm_features_used': ollama_available,
    'llm_feature_names': LLM_FEATURE_NAMES if ollama_available else [],
    'model_used': 'qwen3.5:2b',
}, models_dir / 'training_config.joblib')

print('\nSaved: calibrated model, vectorizer, feature_names, optimal_threshold (%.4f), training_sample %s, training_config to models/' % (optimal_threshold, str(training_sample.shape)))
print('All done!')