# Retrain the spam classifier model
# ENGT 375 Project - Spring 2026 - ODU
# Loads the Enron, puyang2025 and zefang email corpora (plus optional user
# feedback corrections) and trains a calibrated random-forest spam classifier.

import re
import warnings
import json
import hashlib
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
# I found CalibratedClassifierCV in the sklearn docs - it makes the probability
# predictions more accurate instead of just using the raw RF outputs
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, f1_score, precision_recall_curve
# MinMaxScaler normalizes metadata features to 0-1 range so they're on the same
# scale as the TF-IDF features (learned about scaling importance from Module 7A kNN)
from sklearn.preprocessing import MinMaxScaler
# hstack and csr_matrix let me combine the TF-IDF sparse matrix with the metadata
# features without converting everything to a dense array (saves a lot of memory)
from scipy.sparse import hstack, csr_matrix
import joblib
import requests
from tqdm import tqdm

from utils_student import (preprocess_text, compute_metadata_features,
                           spam_context_phrases, ham_context_phrases,
                           registration_phrases, url_shorteners,
                           legitimate_platforms, OLLAMA_API, LLM_FEATURE_NAMES)

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Set up folder paths
project_dir = Path(__file__).parent
data_dir = project_dir / 'data' / 'processed'
raw_dir = project_dir / 'data' / 'raw'
models_dir = project_dir / 'models'
random_state = 42

enron_csv = project_dir / 'data' / 'raw' / 'enron' / 'enron_spam_data.csv'
puyang_parquet = project_dir / 'data' / 'raw' / 'puyang2025' / 'seven_phishing_emails.parquet'
zefang_parquet = project_dir / 'data' / 'raw' / 'zefang' / 'phishing_emails.parquet'

SKIP_LLM_TRAINING = True  # LLM features too slow for large datasets

# JSON keys requested from the LLM, in the order the scores are returned.
_LLM_SCORE_KEYS = ('promotional', 'transactional', 'personal',
                   'phishing', 'urgency', 'formality')
# Neutral fallback used whenever the LLM cannot be reached or parsed.
_NEUTRAL_LLM_FEATURES = (0.5, 0.5, 0.5, 0.5, 0.5, 0.5)
# Failures that mean "the LLM call did not give a usable answer"; anything else
# (e.g. KeyboardInterrupt, MemoryError) is allowed to propagate.
_LLM_CALL_ERRORS = (requests.RequestException, ValueError, KeyError,
                    TypeError, AttributeError)


def check_ollama_available(model='qwen3.5:2b'):
    """Check if Ollama is running and has the specified model.

    Args:
        model: Model name to look for; a substring match against the names
            Ollama reports is enough.

    Returns:
        True if the tags endpoint answers with HTTP 200 and lists a matching
        model, False otherwise (including any connection or parsing problem).
    """
    try:
        resp = requests.get('http://localhost:11434/api/tags', timeout=2)
        if resp.status_code != 200:
            return False
        models = [m['name'] for m in resp.json().get('models', [])]
        return any(model in m for m in models)
    except _LLM_CALL_ERRORS:
        # Best-effort probe: an unreachable or malformed server means "no".
        return False


def _request_llm_features(text, model):
    """Ask Ollama for the six intent/tone scores; return None on any failure."""
    truncated = text[:500]
    prompt = (
        'Rate this email on these dimensions (0.0 to 1.0).\n'
        'Respond with ONLY valid JSON: {"promotional": X, "transactional": X, '
        '"personal": X, "phishing": X, "urgency": X, "formality": X}\n'
        '/no_think\n\n'
        'Email: "%s"' % truncated
    )
    try:
        resp = requests.post(OLLAMA_API, json={
            'model': model,
            'messages': [{'role': 'user', 'content': prompt}],
            'stream': False,
            'think': False,
            'options': {'temperature': 0.1, 'num_predict': 100}
        }, timeout=30)
        if resp.status_code != 200:
            return None
        content = resp.json().get('message', {}).get('content', '')
        # Drop any <think>...</think> reasoning block the model emits anyway.
        # (The previous pattern r'.*?' only matched empty strings, so it
        # removed nothing.)
        content = re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL).strip()
        json_match = re.search(r'\{[^}]+\}', content)
        if not json_match:
            return None
        data = json.loads(json_match.group())
        return [float(data.get(key, 0.5)) for key in _LLM_SCORE_KEYS]
    except _LLM_CALL_ERRORS:
        return None


def extract_llm_features_single(text, model='qwen3.5:2b'):
    """Extract intent and tone features for a single email via Ollama.

    Args:
        text: Raw email text; only the first 500 characters are sent.
        model: Ollama model name.

    Returns:
        A list of six floats (promotional, transactional, personal, phishing,
        urgency, formality). Falls back to 0.5 for every score when the call
        fails or the reply cannot be parsed.
    """
    feats = _request_llm_features(text, model)
    return list(_NEUTRAL_LLM_FEATURES) if feats is None else feats


# I use a hash of each email text as a cache key so I don't have to re-run
# the LLM for emails I've already processed (the LLM calls are really slow)
def get_text_hash(text):
    """Get a stable hash for an email text (first 16 hex chars of SHA-256)."""
    return hashlib.sha256(text.encode('utf-8', errors='replace')).hexdigest()[:16]


CACHE_PATH = data_dir / 'llm_features_cache.csv'


def get_or_compute_llm_features(texts, model='qwen3.5:2b'):
    """Load cached LLM features, compute missing ones, return combined array.

    Args:
        texts: Sequence of raw email strings.
        model: Ollama model name used for emails missing from the cache.

    Returns:
        A numpy array with one row per input text (same order as ``texts``)
        and one column per LLM feature.

    Side effects:
        Reads and rewrites ``CACHE_PATH``. Only successful LLM responses are
        persisted; emails whose call failed get the neutral 0.5 vector for this
        run but stay uncached so a later run retries them.
    """
    hashes = [get_text_hash(t) for t in texts]
    if not hashes:
        return np.empty((0, len(LLM_FEATURE_NAMES)))

    # Load existing cache. The hash column must be read as text: a hex prefix
    # made only of digits (or shaped like '12e5...') would otherwise be parsed
    # as a number and never match again.
    cached = {}
    existing = None
    if CACHE_PATH.exists():
        existing = pd.read_csv(CACHE_PATH, dtype={'hash': str}).set_index('hash')
        existing = existing[~existing.index.duplicated(keep='last')]
        cached = dict(zip(existing.index,
                          existing[LLM_FEATURE_NAMES].values.tolist()))
        print(' Loaded %d cached LLM features' % len(cached))

    # Identify uncached
    uncached_indices = [i for i, h in enumerate(hashes) if h not in cached]
    print(' Need to compute %d new LLM features' % len(uncached_indices))

    if uncached_indices:
        new_entries = []
        n_failed = 0
        for i in tqdm(uncached_indices, desc=' Extracting LLM features'):
            h = hashes[i]
            if h in cached:
                # Same text appeared earlier in this batch; reuse its result.
                continue
            feats = _request_llm_features(texts[i], model)
            if feats is None:
                feats = list(_NEUTRAL_LLM_FEATURES)
                n_failed += 1
            else:
                new_entries.append({'hash': h, **dict(zip(LLM_FEATURE_NAMES, feats))})
            cached[h] = feats

        # Append to cache file
        if new_entries:
            new_df = pd.DataFrame(new_entries).set_index('hash')
            combined = new_df if existing is None else pd.concat([existing, new_df])
            combined = combined[~combined.index.duplicated(keep='last')]
            CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
            combined.to_csv(CACHE_PATH)
        print(' Saved %d new features to cache' % len(new_entries))
        if n_failed:
            print(' WARNING: %d LLM calls failed; used neutral features (not cached)' % n_failed)

    # Build output array in order
    result = np.array([cached[h] for h in hashes])
    return result


print('Starting model training...')

df = pd.DataFrame(columns=['text', 'label'])

# Enron corpus — gold standard real corporate email (~33k)
if enron_csv.exists():
    print('Loading Enron email dataset...')
    df_enron = pd.read_csv(enron_csv)
    df_enron = df_enron.rename(columns={'Message': 'text', 'Spam/Ham': 'label'})
    df_enron['label'] = df_enron['label'].str.strip().str.lower()
    df_enron = df_enron[['text', 'label']].dropna(subset=['text', 'label'])
    df_enron = df_enron[df_enron['label'].isin(['spam', 'ham'])]
    print(' Enron: %d emails (%d ham, %d spam)' % (
        len(df_enron),
        len(df_enron[df_enron['label'] == 'ham']),
        len(df_enron[df_enron['label'] == 'spam'])
    ))
    df = pd.concat([df, df_enron], ignore_index=True)
else:
    print('WARNING: Enron CSV not found at %s' % str(enron_csv))

# puyang2025 — 7 research corpora (TREC-05/06/07, CEAS-08, SpamAssassin, Ling-Spam)
# Enron sub-corpus excluded to avoid duplicating emails already loaded above.
if puyang_parquet.exists():
    print('Loading puyang2025 seven-phishing-email-datasets...')
    df_puyang = pd.read_parquet(puyang_parquet)
    # .copy() so the label assignment below writes to an independent frame
    # rather than to a filtered view of the parquet data.
    df_puyang = df_puyang[df_puyang['dataset_name'] != 'Enron'].copy()
    df_puyang['label'] = df_puyang['label'].map({0: 'ham', 1: 'spam'})
    df_puyang = df_puyang[['text', 'label']].dropna(subset=['text', 'label'])
    print(' puyang2025 (Enron excluded): %d emails (%d ham, %d spam)' % (
        len(df_puyang),
        len(df_puyang[df_puyang['label'] == 'ham']),
        len(df_puyang[df_puyang['label'] == 'spam'])
    ))
    df = pd.concat([df, df_puyang], ignore_index=True)
else:
    print('WARNING: puyang2025 parquet not found at %s' % str(puyang_parquet))

# zefang phishing dataset — 18k emails labeled ham vs phishing
# Phishing is treated as the positive (spam) class for binary classification.
if zefang_parquet.exists():
    print('Loading zefang phishing dataset...')
    df_zefang = pd.read_parquet(zefang_parquet)
    df_zefang['label'] = df_zefang['label'].map({'ham': 'ham', 'phishing': 'spam'})
    df_zefang = df_zefang[['text', 'label']].dropna(subset=['text', 'label'])
    print(' zefang phishing: %d emails (%d ham, %d phishing->spam)' % (
        len(df_zefang),
        len(df_zefang[df_zefang['label'] == 'ham']),
        len(df_zefang[df_zefang['label'] == 'spam'])
    ))
    df = pd.concat([df, df_zefang], ignore_index=True)
else:
    print('WARNING: zefang parquet not found at %s' % str(zefang_parquet))

if len(df) == 0:
    raise RuntimeError('No training data found. Run the data download step first.')

print('Combined dataset: %d emails' % len(df))

# Load user feedback corrections (if any)
FEEDBACK_WEIGHT = 5  # each correction is replicated this many times
df_feedback_clean = pd.DataFrame(columns=['text', 'label'])
feedback_file = project_dir / 'data' / 'feedback' / 'feedback_log.csv'
if feedback_file.exists():
    print('Loading user feedback corrections...')
    df_feedback = pd.read_csv(feedback_file)
    if 'email_text' in df_feedback.columns and 'correct_label' in df_feedback.columns:
        df_feedback_clean = df_feedback[['email_text', 'correct_label']].dropna()
        df_feedback_clean = df_feedback_clean.rename(
            columns={'email_text': 'text', 'correct_label': 'label'})
        # Normalise labels the same way as the corpora. Anything that is not
        # exactly 'spam' would otherwise silently count as ham further down.
        df_feedback_clean['label'] = (
            df_feedback_clean['label'].astype(str).str.strip().str.lower())
        n_raw_feedback = len(df_feedback_clean)
        df_feedback_clean = df_feedback_clean[
            df_feedback_clean['label'].isin(['spam', 'ham'])]
        if len(df_feedback_clean) < n_raw_feedback:
            print(' Feedback: ignored %d rows with unrecognised labels'
                  % (n_raw_feedback - len(df_feedback_clean)))
        # If the same email was corrected more than once, the latest row wins.
        df_feedback_clean = df_feedback_clean.drop_duplicates(
            subset=['text'], keep='last')

# Deduplicate the corpora BEFORE adding weighted feedback. Deduplicating
# afterwards collapsed the replicated rows back to one (so the weighting did
# nothing) and kept the original corpus label over the user's correction.
before = len(df)
df = df.drop_duplicates(subset=['text']).reset_index(drop=True)
print('Total after dedup: %d emails (removed %d duplicates)' % (len(df), before - len(df)))

if len(df_feedback_clean) > 0:
    # A correction overrides any corpus row carrying the same text.
    df = df[~df['text'].isin(df_feedback_clean['text'])]
    # Weight corrections to amplify their impact.
    # NOTE(review): replicated rows can land in both the train and test split,
    # which slightly inflates test metrics; passing sample_weight to fit()
    # would avoid that if it becomes a concern.
    df_feedback_weighted = pd.concat([df_feedback_clean] * FEEDBACK_WEIGHT,
                                     ignore_index=True)
    df = pd.concat([df, df_feedback_weighted], ignore_index=True)
    print(' Feedback: %d corrections (%dx weighted = %d rows)'
          % (len(df_feedback_clean), FEEDBACK_WEIGHT, len(df_feedback_weighted)))

print('Preprocessing text...')
df['clean_text'] = df['text'].apply(preprocess_text)

# Build expanded TF-IDF features + metadata features + optional LLM features
print('Building features (3000 TF-IDF + 24 metadata + optional LLM)...')
tfidf = TfidfVectorizer(
    max_features=3000,
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.90,
    sublinear_tf=True
)
X_tfidf = tfidf.fit_transform(df['clean_text'])
X_meta = compute_metadata_features(df['text'].values)

# Normalize metadata features to 0-1 range so they match the TF-IDF scale
# Without this, features like email_length (could be 1000+) would dominate
# over binary features like has_unsubscribe (just 0 or 1)
meta_scaler = MinMaxScaler()
X_meta_scaled = meta_scaler.fit_transform(X_meta)

# NOTE(review): presumably in the same column order as the output of
# compute_metadata_features — confirm against utils_student.
meta_feature_names = ['exclamation_density', 'dollar_sign_count', 'caps_word_ratio',
                      'spam_phrase_count', 'ham_phrase_count', 'net_spam_context',
                      'url_count', 'html_tag_count', 'email_length',
                      'avg_sentence_length', 'capitalization_ratio',
                      'has_specific_date', 'has_specific_time', 'date_reference_count',
                      'has_unsubscribe', 'has_physical_address', 'has_proper_greeting',
                      'has_contact_info', 'registration_language_score',
                      'cta_to_info_ratio', 'shortener_url_ratio',
                      'legitimate_platform_count', 'gov_edu_url_count',
                      'question_mark_count']

# Check if Ollama is available for LLM feature extraction
# SKIP_LLM_TRAINING flag skips the slow LLM calls during training since
# TF-IDF + metadata features are already enough for good accuracy
if SKIP_LLM_TRAINING:
    print('SKIP_LLM_TRAINING=True - skipping LLM features for faster training')
    ollama_available = False
else:
    ollama_available = check_ollama_available()

if ollama_available:
    print('Ollama available - extracting LLM intent/tone features...')
    X_llm = get_or_compute_llm_features(df['text'].values)
    # Combine all feature matrices - I use hstack from scipy.sparse because
    # TF-IDF is already sparse and converting to dense would use too much memory
    X_combined = hstack([X_tfidf, csr_matrix(X_meta_scaled), csr_matrix(X_llm)])
    feature_names = list(tfidf.get_feature_names_out()) + meta_feature_names + LLM_FEATURE_NAMES
else:
    print('Ollama not available - skipping LLM features')
    X_combined = hstack([X_tfidf, csr_matrix(X_meta_scaled)])
    feature_names = list(tfidf.get_feature_names_out()) + meta_feature_names

# Binary target: 1 = spam, 0 = ham
y = (df['label'] == 'spam').astype(int)

n_llm = len(LLM_FEATURE_NAMES) if ollama_available else 0
print('Total features: %d (%d TF-IDF + %d metadata + %d LLM)' %
      (len(feature_names), X_tfidf.shape[1], len(meta_feature_names), n_llm))

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y, test_size=0.3, random_state=random_state, stratify=y
)

print('Running GridSearchCV...')
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [20, None],
}

# Use class_weight balanced to handle class imbalance
# Reduced grid from 8 combos to 4, and CV from 5 to 3 folds for faster training
rf = RandomForestClassifier(random_state=random_state, n_jobs=-1, class_weight='balanced')
grid_search = GridSearchCV(rf, param_grid, cv=3, scoring='f1', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

best_rf = grid_search.best_estimator_
print('Best params: %s' % str(grid_search.best_params_))
print('Best CV F1: %.4f' % grid_search.best_score_)

# Calibrate probabilities so the confidence percentages are more meaningful
# I found this technique on the sklearn docs - without it the RF probabilities
# can be overconfident, which makes the threshold slider in the app less useful
print('Calibrating probabilities with isotonic regression...')
calibrated_rf = CalibratedClassifierCV(best_rf, method='isotonic', cv=5)
calibrated_rf.fit(X_train, y_train)

# Compute optimal threshold targeting 99% ham precision on test set
# NOTE(review): the threshold is tuned on the same test set that is reported
# below, so the "optimal threshold" report is optimistic.
y_test_proba = calibrated_rf.predict_proba(X_test)[:, 1]
precision, recall, thresholds_pr = precision_recall_curve(y_test, y_test_proba)

# Find the highest threshold where items classified as ham still have >= 99% precision
# Higher threshold = more emails classified as ham, so we want as high as possible
# while keeping ham predictions accurate
best_threshold = 0.50
for t in sorted(thresholds_pr, reverse=True):
    predicted_ham_mask = y_test_proba < t
    if predicted_ham_mask.sum() == 0:
        continue
    ham_precision = (y_test.values[predicted_ham_mask] == 0).sum() / predicted_ham_mask.sum()
    if ham_precision >= 0.99:
        best_threshold = t
        break

optimal_threshold = best_threshold
print('Optimal threshold (99%% ham precision): %.4f' % optimal_threshold)

y_pred = calibrated_rf.predict(X_test)
y_pred_optimal = (y_test_proba >= optimal_threshold).astype(int)

print('\nTest Set Performance (default 0.5 threshold):')
print(classification_report(y_test, y_pred, target_names=['Ham', 'Spam']))

print('Test Set Performance (optimal %.2f threshold):' % optimal_threshold)
print(classification_report(y_test, y_pred_optimal, target_names=['Ham', 'Spam']))

# Save model artifacts
print('Saving model files...')
# parents=True so a fresh checkout without any intermediate folder still works
models_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(calibrated_rf, models_dir / 'random_forest_spam.joblib')
joblib.dump(best_rf, models_dir / 'random_forest_raw.joblib')
joblib.dump(tfidf, models_dir / 'tfidf_vectorizer.joblib')
joblib.dump(feature_names, models_dir / 'feature_names.joblib')
joblib.dump(optimal_threshold, models_dir / 'optimal_threshold.joblib')
joblib.dump(meta_scaler, models_dir / 'meta_scaler.joblib')

# Save training sample for LIME (200 random rows from training set)
# Pick the row indices first and densify only those rows; calling .toarray()
# on the whole training matrix allocates rows x features floats just to keep 200.
n_train_rows = X_train.shape[0]
rng = np.random.RandomState(random_state)
sample_idx = rng.choice(n_train_rows, size=min(200, n_train_rows), replace=False)
training_sample = X_train.tocsr()[sample_idx].toarray()
joblib.dump(training_sample, models_dir / 'training_sample.joblib')

# Save training config (tracks whether LLM features were used)
joblib.dump({
    'llm_features_used': ollama_available,
    'llm_feature_names': LLM_FEATURE_NAMES if ollama_available else [],
    # The LLM helpers in this file are called with their default model,
    # 'qwen3.5:2b'; the config previously recorded 'gemma3:1b', which is not
    # used anywhere in this script.
    'model_used': 'qwen3.5:2b',
}, models_dir / 'training_config.joblib')

print('\nSaved: calibrated model, vectorizer, feature_names, optimal_threshold (%.4f), training_sample %s, training_config to models/' % (optimal_threshold, str(training_sample.shape)))
print('All done!')