# Training pipeline for the spam classifier
# ENGT 375 Project - Spring 2026 - ODU
#
# This script loads three email datasets (Enron, puyang2025, zefang),
# engineers features, compares three models (Random Forest, Logistic
# Regression, SVM), builds a VotingClassifier ensemble, and saves all
# artifacts needed by the Gradio app.

import json
import warnings
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from scipy.sparse import hstack, csr_matrix
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC

# Import our shared utility functions (preprocess_text, compute_metadata_features)
from utils import preprocess_text, compute_metadata_features, META_FEATURE_NAMES

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
project_dir = Path(__file__).parent
data_dir = project_dir / 'data'
models_dir = project_dir / 'models'
random_state = 42


# ---------------------------------------------------------------------------
# Helper: collect accuracy, precision, recall, f1 into a dict
# ---------------------------------------------------------------------------
def get_metrics(y_true, y_pred):
    """Return accuracy, precision, recall, and f1 for the positive class (spam = 1).

    The numbers are pulled from classification_report so they are not
    computed twice, and each one is rounded to 4 decimal places.

    Args:
        y_true: true labels (this script passes 0 = ham, 1 = spam).
        y_pred: predicted labels, same length as ``y_true``.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.

    Raises:
        KeyError: if the report has no '1' entry, i.e. label 1 appears in
            neither ``y_true`` nor ``y_pred``.
    """
    report = classification_report(y_true, y_pred, output_dict=True)
    # The report dict is keyed by the string form of each label, so the
    # spam class (label 1) is looked up as '1'.
    spam_scores = report['1']
    return {
        'accuracy': round(report['accuracy'], 4),
        'precision': round(spam_scores['precision'], 4),
        'recall': round(spam_scores['recall'], 4),
        'f1': round(spam_scores['f1-score'], 4),
    }


# ===================================================================
# 1. DATA LOADING
# ===================================================================
print('=' * 60)
print('STEP 1 — Loading datasets')
print('=' * 60)

# --- 1a. Enron corpus (~33k real corporate emails, gold-standard benchmark) -
enron_csv = data_dir / 'raw' / 'enron' / 'enron_spam_data.csv'
frames = []  # one DataFrame per source, each reduced to ['text', 'label']
if enron_csv.exists():
    print('Loading Enron corpus ...')
    df_enron = pd.read_csv(enron_csv)
    df_enron = df_enron.rename(columns={'Message': 'text', 'Spam/Ham': 'label'})
    # Normalise label spelling before filtering on it.
    df_enron['label'] = df_enron['label'].str.strip().str.lower()
    df_enron = df_enron[df_enron['label'].isin(['ham', 'spam'])][['text', 'label']].dropna()
    frames.append(df_enron)
    print(' Enron: %d emails' % len(df_enron))
else:
    print('WARNING: Enron CSV not found at %s' % enron_csv)

# --- 1b. puyang2025 — 7 research corpora (TREC-05/06/07, CEAS-08, etc.) ----
# Enron sub-corpus excluded to avoid duplicating emails already loaded above.
puyang_parquet = data_dir / 'raw' / 'puyang2025' / 'seven_phishing_emails.parquet'
if puyang_parquet.exists():
    print('Loading puyang2025 seven-phishing-email-datasets ...')
    df_puyang = pd.read_parquet(puyang_parquet)
    # .copy() gives an independent frame, so the label assignment below does
    # not write into a filtered slice (chained assignment).
    df_puyang = df_puyang[df_puyang['dataset_name'] != 'Enron'].copy()
    # Values other than 0 / 1 map to NaN and are removed by dropna().
    df_puyang['label'] = df_puyang['label'].map({0: 'ham', 1: 'spam'})
    df_puyang = df_puyang[['text', 'label']].dropna()
    frames.append(df_puyang)
    print(' puyang2025 (Enron excluded): %d emails' % len(df_puyang))
else:
    print('WARNING: puyang2025 parquet not found at %s' % puyang_parquet)

# --- 1c.
# zefang phishing dataset — 18k phishing-labeled emails --------------
# Phishing is treated as the positive (spam) class for binary classification.
zefang_parquet = data_dir / 'raw' / 'zefang' / 'phishing_emails.parquet'
if zefang_parquet.exists():
    print('Loading zefang phishing dataset ...')
    df_zefang = pd.read_parquet(zefang_parquet)
    # Labels other than 'ham' / 'phishing' map to NaN and are dropped below.
    df_zefang['label'] = df_zefang['label'].map({'ham': 'ham', 'phishing': 'spam'})
    df_zefang = df_zefang[['text', 'label']].dropna()
    frames.append(df_zefang)
    print(' zefang phishing: %d emails' % len(df_zefang))
else:
    print('WARNING: zefang parquet not found at %s' % zefang_parquet)

if not frames:
    raise RuntimeError('No training data found! Check data/raw/ folder.')

# --- 1d. Combine and deduplicate -------------------------------------------
df = pd.concat(frames, ignore_index=True)
print('Combined: %d emails' % len(df))

before_dedup = len(df)
df = df.drop_duplicates(subset=['text']).reset_index(drop=True)
print('After dedup: %d emails (removed %d duplicates)' % (len(df), before_dedup - len(df)))

ham_count = (df['label'] == 'ham').sum()
spam_count = (df['label'] == 'spam').sum()
print(' Ham: %d | Spam: %d' % (ham_count, spam_count))

# Files may exist but contribute zero usable rows after filtering / dropna.
if len(df) == 0:
    raise RuntimeError('No training data found! Check the data/ folder.')

# ===================================================================
# 2. FEATURE ENGINEERING
# ===================================================================
print()
print('=' * 60)
print('STEP 2 — Feature engineering')
print('=' * 60)

# 2a. Preprocess (stem, remove stopwords, etc.)
print('Preprocessing text ...')
df['clean_text'] = df['text'].apply(preprocess_text)

# Encode labels: 1 = spam, 0 = ham
y = (df['label'] == 'spam').astype(int)

# Pick the train / test rows BEFORE fitting anything that learns from data.
# The TF-IDF vocabulary / IDF weights and the MinMax ranges must come from the
# training rows only; otherwise the held-out emails leak into the features
# and the test metrics are optimistic. Splitting row positions with the same
# seed and stratification selects the same rows as splitting a feature matrix.
train_idx, test_idx = train_test_split(
    np.arange(len(df)),
    test_size=0.30,
    random_state=random_state,
    stratify=y,
)
y_train = y.iloc[train_idx]
y_test = y.iloc[test_idx]

# 2b. TF-IDF on the cleaned text (fit on train, transform-only on test)
print('Fitting TF-IDF (max 3000 features, 1-3 grams) ...')
tfidf = TfidfVectorizer(
    max_features=3000,
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.90,
    sublinear_tf=True,
)
X_tfidf_train = tfidf.fit_transform(df['clean_text'].iloc[train_idx])
X_tfidf_test = tfidf.transform(df['clean_text'].iloc[test_idx])
print(' TF-IDF shape: %s' % str(X_tfidf_train.shape))

# 2c. 24 metadata features (things like exclamation density, caps ratio, ...)
print('Computing 24 metadata features ...')
raw_texts = df['text'].values
# NOTE(review): assumes compute_metadata_features derives each row only from
# its own email, so computing train and test separately yields the same
# values as one call over the whole corpus — confirm against utils.
X_meta_train_raw = compute_metadata_features(raw_texts[train_idx])
X_meta_test_raw = compute_metadata_features(raw_texts[test_idx])

# Scale metadata to 0-1 so they match the TF-IDF scale. The ranges are learned
# from the training rows; test rows may therefore fall slightly outside [0, 1].
meta_scaler = MinMaxScaler()
X_meta_train = meta_scaler.fit_transform(X_meta_train_raw)
X_meta_test = meta_scaler.transform(X_meta_test_raw)

# 2d. Combine TF-IDF (sparse) + metadata (dense -> sparse) into one matrix.
# CSR is requested explicitly because later steps index rows of X_train.
X_train = hstack([X_tfidf_train, csr_matrix(X_meta_train)], format='csr')
X_test = hstack([X_tfidf_test, csr_matrix(X_meta_test)], format='csr')

feature_names = list(tfidf.get_feature_names_out()) + META_FEATURE_NAMES
print('Total features: %d (%d TF-IDF + %d metadata)' % (
    len(feature_names), X_tfidf_train.shape[1], len(META_FEATURE_NAMES)
))

# ===================================================================
# 3. MODEL COMPARISON
# ===================================================================
print()
print('=' * 60)
print('STEP 3 — Train / test split + model comparison')
print('=' * 60)

print('Train: %d | Test: %d' % (X_train.shape[0], X_test.shape[0]))

# We'll store metrics for each model here
all_metrics = {}

# --- Random Forest ----------------------------------------------------------
print('\nTraining Random Forest ...')
rf = RandomForestClassifier(
    n_estimators=200,
    n_jobs=-1,
    class_weight='balanced',
    random_state=random_state,
)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
print(classification_report(y_test, rf_pred, target_names=['Ham', 'Spam']))
all_metrics['RandomForest'] = get_metrics(y_test, rf_pred)

# --- Logistic Regression ----------------------------------------------------
print('Training Logistic Regression ...')
lr = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    random_state=random_state,
)
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
print(classification_report(y_test, lr_pred, target_names=['Ham', 'Spam']))
all_metrics['LogisticRegression'] = get_metrics(y_test, lr_pred)

# --- SVM (linear kernel) ----------------------------------------------------
# LinearSVC wrapped in CalibratedClassifierCV is used in place of
# SVC(probability=True): per the project notes, SVC is O(n^2) and would take
# hours on 100K emails. The calibration wrapper supplies predict_proba(),
# which the soft-voting ensemble below requires.
print('Training SVM (LinearSVC + calibration for probabilities) ...')
base_svm = LinearSVC(class_weight='balanced', max_iter=2000, random_state=random_state)
svm = CalibratedClassifierCV(base_svm, cv=5)
svm_pred = svm.fit(X_train, y_train).predict(X_test)
svm_summary = classification_report(y_test, svm_pred, target_names=['Ham', 'Spam'])
print(svm_summary)
all_metrics['SVM'] = get_metrics(y_test, svm_pred)

# ===================================================================
# 4. ENSEMBLE (VotingClassifier)
# ===================================================================
print()
print('=' * 60)
print('STEP 4 — Voting ensemble (soft voting)')
print('=' * 60)

# The ensemble gets brand-new, unfitted estimators (configured like the
# stand-alone models) rather than the instances fitted above.
ensemble_rf = RandomForestClassifier(
    n_estimators=200,
    n_jobs=-1,
    class_weight='balanced',
    random_state=random_state,
)
ensemble_lr = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    random_state=random_state,
)
ensemble_svm = CalibratedClassifierCV(
    LinearSVC(class_weight='balanced', max_iter=2000, random_state=random_state),
    cv=5,
)
ensemble = VotingClassifier(
    estimators=[('rf', ensemble_rf), ('lr', ensemble_lr), ('svm', ensemble_svm)],
    voting='soft',
    n_jobs=-1,
)

print('Fitting ensemble (all three models from scratch) ...')
ensemble.fit(X_train, y_train)
ens_pred = ensemble.predict(X_test)
ensemble_summary = classification_report(y_test, ens_pred, target_names=['Ham', 'Spam'])
print(ensemble_summary)
all_metrics['VotingEnsemble'] = get_metrics(y_test, ens_pred)

# --- Optimal threshold via precision-recall curve ---
# Goal: find the threshold where ham precision >= 99%, i.e. whenever the
# model says "ham", it is right at least 99% of the time.
# No %-formatting is applied to this message, so the percent sign is written
# once (a doubled '%%' would be printed literally).
print('Computing optimal threshold (target: 99% ham precision) ...')
y_test_proba = ensemble.predict_proba(X_test)[:, 1]  # P(spam)
# Only the candidate thresholds are needed; they come back in increasing order.
_, _, thresholds_pr = precision_recall_curve(y_test, y_test_proba)

optimal_threshold = 0.50  # fallback when no candidate reaches the target

# An email is classified as ham when P(spam) < t. For every candidate t we
# need (a) how many emails fall below t and (b) how many of those are truly
# ham. Sorting the probabilities once and keeping a running ham count answers
# both for all candidates at once, instead of re-masking the whole test set
# per threshold.
# NOTE(review): the threshold is tuned on the same test split used for the
# reported metrics; a separate validation split would be cleaner.
y_test_arr = np.asarray(y_test)
proba_order = np.argsort(y_test_proba)
sorted_proba = y_test_proba[proba_order]
# ham_seen[k] = number of true-ham emails among the k lowest-probability ones
ham_seen = np.concatenate(([0], np.cumsum(y_test_arr[proba_order] == 0)))

# side='left' counts the probabilities strictly below each threshold
predicted_ham_counts = np.searchsorted(sorted_proba, thresholds_pr, side='left')
actual_ham_counts = ham_seen[predicted_ham_counts]

# Candidates that predict no ham at all have no defined precision; skip them.
nonempty = predicted_ham_counts > 0
ham_precision = np.zeros(len(thresholds_pr), dtype=float)
ham_precision[nonempty] = actual_ham_counts[nonempty] / predicted_ham_counts[nonempty]

# Take the highest threshold that still meets the 99% ham-precision target.
qualifying = nonempty & (ham_precision >= 0.99)
if qualifying.any():
    # Stored as a builtin float so the saved artifacts hold a plain number.
    optimal_threshold = float(thresholds_pr[qualifying].max())

print('Optimal threshold: %.4f' % optimal_threshold)

# ===================================================================
# 5. SAVE ARTIFACTS
# ===================================================================
print()
print('=' * 60)
print('STEP 5 — Saving artifacts to models/')
print('=' * 60)

# parents=True also creates missing intermediate directories.
models_dir.mkdir(parents=True, exist_ok=True)

# 5a. Voting model
joblib.dump(ensemble, models_dir / 'voting_model.joblib')
print(' Saved voting_model.joblib')

# 5b. TF-IDF vectorizer
joblib.dump(tfidf, models_dir / 'tfidf_vectorizer.joblib')
print(' Saved tfidf_vectorizer.joblib')

# 5c. Metadata scaler
joblib.dump(meta_scaler, models_dir / 'meta_scaler.joblib')
print(' Saved meta_scaler.joblib')

# 5d. Feature names list (TF-IDF names + META_FEATURE_NAMES)
joblib.dump(feature_names, models_dir / 'feature_names.joblib')
print(' Saved feature_names.joblib (%d names)' % len(feature_names))

# 5e. Optimal threshold
joblib.dump(optimal_threshold, models_dir / 'optimal_threshold.joblib')
print(' Saved optimal_threshold.joblib (%.4f)' % optimal_threshold)

# 5f.
# Training sample (200 dense rows for LIME / SHAP explanations)
# A seeded generator keeps the sampled rows reproducible between runs.
rng = np.random.RandomState(random_state)
n_train_rows = X_train.shape[0]
sample_size = min(200, n_train_rows)  # never ask for more rows than exist
sample_idx = rng.choice(n_train_rows, size=sample_size, replace=False)
sampled_rows = X_train[sample_idx]
training_sample = sampled_rows.toarray()  # convert sparse -> dense
joblib.dump(training_sample, models_dir / 'training_sample.joblib')
print(' Saved training_sample.joblib shape=%s' % (training_sample.shape,))

# 5g. Training report (JSON with metrics for every model)
report = {
    'random_state': random_state,
    'train_size': int(n_train_rows),
    'test_size': int(X_test.shape[0]),
    'total_features': len(feature_names),
    'optimal_threshold': round(optimal_threshold, 4),
    'models': all_metrics,
}
report_path = models_dir / 'training_report.json'
with report_path.open('w') as report_file:
    report_file.write(json.dumps(report, indent=2))
print(' Saved training_report.json')

# Closing banner
print()
print('=' * 60)
print('DONE — all 7 artifacts saved to models/')
print('=' * 60)