spam-xai-model-v2 / retrain_student.py
VoltageVagabond's picture
Upload folder using huggingface_hub
960ec3d verified
Raw
History Blame
16.6 kB
# Retrain the spam classifier model
# ENGT 375 Project - Spring 2026 - ODU
# Loads the Enron, puyang2025 and zefang email datasets, plus optional user feedback corrections
import re
import warnings
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
# I found CalibratedClassifierCV in the sklearn docs - it makes the probability
# predictions more accurate instead of just using the raw RF outputs
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, f1_score, precision_recall_curve
# MinMaxScaler normalizes metadata features to 0-1 range so they're on the same
# scale as the TF-IDF features (learned about scaling importance from Module 7A kNN)
from sklearn.preprocessing import MinMaxScaler
# hstack and csr_matrix let me combine the TF-IDF sparse matrix with the metadata
# features without converting everything to a dense array (saves a lot of memory)
from scipy.sparse import hstack, csr_matrix
import joblib
import json
import hashlib
import requests
from tqdm import tqdm
from utils_student import (preprocess_text, compute_metadata_features,
spam_context_phrases, ham_context_phrases,
registration_phrases, url_shorteners,
legitimate_platforms, OLLAMA_API, LLM_FEATURE_NAMES)
# Silence noisy library warnings so the training log stays readable.
for _ignored_category in (FutureWarning, DeprecationWarning):
    warnings.filterwarnings('ignore', category=_ignored_category)

# Folder layout, all relative to the directory that contains this script.
project_dir = Path(__file__).parent
raw_dir = project_dir / 'data' / 'raw'
data_dir = project_dir / 'data' / 'processed'
models_dir = project_dir / 'models'

# Raw dataset files, each in its own sub-folder of data/raw.
enron_csv = raw_dir / 'enron' / 'enron_spam_data.csv'
puyang_parquet = raw_dir / 'puyang2025' / 'seven_phishing_emails.parquet'
zefang_parquet = raw_dir / 'zefang' / 'phishing_emails.parquet'

# Seed shared by the train/test split, the random forest and the LIME sample.
random_state = 42

# LLM features are too slow to extract for large datasets, so they are off
# by default during training.
SKIP_LLM_TRAINING = True
def check_ollama_available(model='qwen3.5:2b'):
    """Report whether a local Ollama server lists a model matching *model*.

    Queries the tags endpoint on localhost:11434 with a 2 second timeout.
    A model counts as present when *model* is a substring of any listed
    model name. Any failure (server down, bad status, malformed reply) is
    treated as "not available" rather than raised.
    """
    try:
        reply = requests.get('http://localhost:11434/api/tags', timeout=2)
        if reply.status_code != 200:
            return False
        listed_names = [entry['name'] for entry in reply.json().get('models', [])]
        for listed in listed_names:
            if model in listed:
                return True
        return False
    except Exception:
        # Best-effort probe: an unreachable server simply means "no LLM".
        return False
def _clamp_llm_score(raw, neutral=0.5):
    """Coerce one LLM-reported score to a float in [0.0, 1.0].

    Missing, non-numeric, NaN or infinite values become *neutral*; finite
    values outside the range are clipped to the nearest bound.
    """
    try:
        value = float(raw)
    except (TypeError, ValueError, OverflowError):
        return neutral
    # NaN fails both comparisons and +/-inf fails one of them.
    if not (float('-inf') < value < float('inf')):
        return neutral
    return min(1.0, max(0.0, value))


def extract_llm_features_single(text, model='qwen3.5:2b'):
    """Extract intent and tone features for a single email via Ollama.

    Sends the first 500 characters of *text* to OLLAMA_API as a single-turn
    chat request and parses the first JSON object found in the reply.

    Args:
        text: Email text (a string); only ``text[:500]`` is sent.
        model: Ollama model tag to query.

    Returns:
        A list of six floats in [0.0, 1.0], ordered promotional,
        transactional, personal, phishing, urgency, formality. A dimension
        the model omitted or reported as a non-numeric / non-finite value is
        set to the neutral 0.5. If the request fails, the status is not 200,
        or no JSON object can be parsed, every dimension is 0.5.
    """
    dimensions = ('promotional', 'transactional', 'personal',
                  'phishing', 'urgency', 'formality')
    neutral = 0.5

    truncated = text[:500]
    prompt = (
        'Rate this email on these dimensions (0.0 to 1.0).\n'
        'Respond with ONLY valid JSON: {"promotional": X, "transactional": X, '
        '"personal": X, "phishing": X, "urgency": X, "formality": X}\n'
        '/no_think\n\n'
        'Email: "%s"'
    ) % truncated

    try:
        resp = requests.post(OLLAMA_API, json={
            'model': model,
            'messages': [{'role': 'user', 'content': prompt}],
            'stream': False,
            'think': False,
            'options': {'temperature': 0.1, 'num_predict': 100}
        }, timeout=30)
        if resp.status_code == 200:
            content = resp.json().get('message', {}).get('content', '')
            # Drop any reasoning block the model emitted despite /no_think.
            content = re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL).strip()
            json_match = re.search(r'\{[^}]+\}', content)
            if json_match:
                data = json.loads(json_match.group())
                # json.loads accepts NaN/Infinity and the model may answer on
                # another scale; these values are not rescaled later, so each
                # one is validated and clamped here.
                return [_clamp_llm_score(data.get(name, neutral), neutral)
                        for name in dimensions]
    except Exception:
        # Deliberate best-effort: this runs once per email in a long batch,
        # so a failed call degrades to neutral features instead of aborting.
        pass
    return [neutral] * len(dimensions)
# I use a hash of each email text as a cache key so I don't have to re-run
# the LLM for emails I've already processed (the LLM calls are really slow)
def get_text_hash(text):
    """Return a 16-hex-character SHA-256 prefix identifying *text*.

    The text is encoded as UTF-8 with unencodable characters replaced, so
    the same string always maps to the same key.
    """
    encoded = text.encode('utf-8', errors='replace')
    digest = hashlib.sha256(encoded).hexdigest()
    return digest[:16]
CACHE_PATH = data_dir / 'llm_features_cache.csv'


def get_or_compute_llm_features(texts, model='qwen3.5:2b'):
    """Load cached LLM features, compute missing ones, return combined array.

    Each text is keyed by ``get_text_hash``. Hashes already present in the
    CSV at CACHE_PATH are reused; the rest are computed with
    ``extract_llm_features_single`` and written back to the cache.

    Args:
        texts: Sequence of email strings.
        model: Ollama model tag forwarded to ``extract_llm_features_single``.

    Returns:
        A 2-D numpy array with one row per entry of *texts*, in the same
        order, and one column per name in LLM_FEATURE_NAMES. For empty
        input the shape is ``(0, len(LLM_FEATURE_NAMES))``.
    """
    feature_cols = list(LLM_FEATURE_NAMES)
    hashes = [get_text_hash(t) for t in texts]

    # Load the existing cache once. The hash column is read as text so that
    # an all-digit hash is not parsed as a number, which would never match.
    cached = {}
    cache_df = None
    if CACHE_PATH.exists():
        cache_df = pd.read_csv(CACHE_PATH, dtype={'hash': str}).set_index('hash')
        cache_df = cache_df[~cache_df.index.duplicated(keep='last')]
        cached = dict(zip(cache_df.index, cache_df[feature_cols].values.tolist()))
        print(' Loaded %d cached LLM features' % len(cached))

    # Map each uncached hash to the first index where it occurs, so a text
    # that appears several times is only sent to the LLM once.
    pending = {}
    for i, h in enumerate(hashes):
        if h not in cached:
            pending.setdefault(h, i)
    print(' Need to compute %d new LLM features' % len(pending))

    if pending:
        new_entries = []
        for h, i in tqdm(pending.items(), desc=' Extracting LLM features'):
            feats = extract_llm_features_single(texts[i], model=model)
            # NOTE(review): extract_llm_features_single returns a neutral
            # vector when the call fails, and that vector is cached like any
            # other result - confirm that persisting failures is intended.
            cached[h] = feats
            new_entries.append({'hash': h, **dict(zip(feature_cols, feats))})

        # Merge with what was already on disk and rewrite the cache file.
        new_df = pd.DataFrame(new_entries).set_index('hash')
        combined = new_df if cache_df is None else pd.concat([cache_df, new_df])
        combined = combined[~combined.index.duplicated(keep='last')]
        CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
        combined.to_csv(CACHE_PATH)
        print(' Saved %d new features to cache' % len(new_entries))

    # Build output array in the caller's order.
    if not hashes:
        return np.empty((0, len(feature_cols)))
    return np.array([cached[h] for h in hashes])
print('Starting model training...')

# Every dataset that is found contributes one (text, label) frame; they are
# concatenated once at the end instead of growing `df` from an empty frame.
dataset_frames = []

# Enron corpus — gold standard real corporate email (~33k)
if enron_csv.exists():
    print('Loading Enron email dataset...')
    df_enron = pd.read_csv(enron_csv)
    df_enron = df_enron.rename(columns={'Message': 'text', 'Spam/Ham': 'label'})
    df_enron['label'] = df_enron['label'].str.strip().str.lower()
    df_enron = df_enron[['text', 'label']].dropna(subset=['text', 'label'])
    df_enron = df_enron[df_enron['label'].isin(['spam', 'ham'])]
    print(' Enron: %d emails (%d ham, %d spam)' % (
        len(df_enron),
        len(df_enron[df_enron['label'] == 'ham']),
        len(df_enron[df_enron['label'] == 'spam'])
    ))
    dataset_frames.append(df_enron)
else:
    print('WARNING: Enron CSV not found at %s' % str(enron_csv))

# puyang2025 — 7 research corpora (TREC-05/06/07, CEAS-08, SpamAssassin, Ling-Spam)
# Enron sub-corpus excluded to avoid duplicating emails already loaded above.
if puyang_parquet.exists():
    print('Loading puyang2025 seven-phishing-email-datasets...')
    df_puyang = pd.read_parquet(puyang_parquet)
    # .copy() so the label assignment below writes to an independent frame
    # rather than to a filtered slice of the frame that was read.
    df_puyang = df_puyang[df_puyang['dataset_name'] != 'Enron'].copy()
    # Labels other than 0/1 map to NaN and are removed by the dropna below.
    df_puyang['label'] = df_puyang['label'].map({0: 'ham', 1: 'spam'})
    df_puyang = df_puyang[['text', 'label']].dropna(subset=['text', 'label'])
    print(' puyang2025 (Enron excluded): %d emails (%d ham, %d spam)' % (
        len(df_puyang),
        len(df_puyang[df_puyang['label'] == 'ham']),
        len(df_puyang[df_puyang['label'] == 'spam'])
    ))
    dataset_frames.append(df_puyang)
else:
    print('WARNING: puyang2025 parquet not found at %s' % str(puyang_parquet))

# zefang phishing dataset — 18k emails labeled ham vs phishing
# Phishing is treated as the positive (spam) class for binary classification.
if zefang_parquet.exists():
    print('Loading zefang phishing dataset...')
    df_zefang = pd.read_parquet(zefang_parquet)
    # Any label other than 'ham'/'phishing' maps to NaN and is dropped below.
    df_zefang['label'] = df_zefang['label'].map({'ham': 'ham', 'phishing': 'spam'})
    df_zefang = df_zefang[['text', 'label']].dropna(subset=['text', 'label'])
    print(' zefang phishing: %d emails (%d ham, %d phishing->spam)' % (
        len(df_zefang),
        len(df_zefang[df_zefang['label'] == 'ham']),
        len(df_zefang[df_zefang['label'] == 'spam'])
    ))
    dataset_frames.append(df_zefang)
else:
    print('WARNING: zefang parquet not found at %s' % str(zefang_parquet))

if dataset_frames:
    df = pd.concat(dataset_frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=['text', 'label'])

if len(df) == 0:
    raise RuntimeError('No training data found. Run the data download step first.')
print('Combined dataset: %d emails' % len(df))
# Deduplicate the base corpora first. This has to happen BEFORE the feedback
# rows are appended: the 5x weighting below works by repeating rows, and a
# later drop_duplicates on 'text' would collapse the copies back to one.
before = len(df)
df = df.drop_duplicates(subset=['text']).reset_index(drop=True)
print('Total after dedup: %d emails (removed %d duplicates)' % (len(df), before - len(df)))

# Load user feedback corrections (if any)
feedback_file = project_dir / 'data' / 'feedback' / 'feedback_log.csv'
if feedback_file.exists():
    print('Loading user feedback corrections...')
    df_feedback = pd.read_csv(feedback_file)
    if 'email_text' in df_feedback.columns and 'correct_label' in df_feedback.columns:
        df_feedback_clean = df_feedback[['email_text', 'correct_label']].dropna()
        df_feedback_clean = df_feedback_clean.rename(columns={'email_text': 'text', 'correct_label': 'label'})
        # Normalise labels the same way the Enron labels are normalised, so
        # that e.g. 'Spam' is not silently treated as non-spam later on.
        df_feedback_clean['label'] = df_feedback_clean['label'].astype(str).str.strip().str.lower()
        # One correction per email; the last row wins (assumes the log is
        # appended chronologically - TODO confirm).
        df_feedback_clean = df_feedback_clean.drop_duplicates(subset=['text'], keep='last')
        # A corrected email may already be in the corpora with the old label;
        # remove that row so the user's label is the only one for this text.
        df = df[~df['text'].isin(df_feedback_clean['text'])]
        # Weight corrections 5x to amplify their impact
        # NOTE(review): repeated rows can land on both sides of the later
        # train/test split, which slightly inflates test metrics.
        df_feedback_weighted = pd.concat([df_feedback_clean] * 5, ignore_index=True)
        df = pd.concat([df, df_feedback_weighted], ignore_index=True)
        print(' Feedback: %d corrections (5x weighted = %d rows)' % (len(df_feedback_clean), len(df_feedback_weighted)))
print('Preprocessing text...')
df['clean_text'] = df['text'].apply(preprocess_text)

# Build expanded TF-IDF features + metadata features + optional LLM features
# NOTE(review): the vectorizer and the scaler are fitted on the full dataset
# before the train/test split, so test rows influence the vocabulary, IDF
# weights and scaling range.
print('Building features (3000 TF-IDF + 24 metadata + optional LLM)...')
tfidf = TfidfVectorizer(
    max_features=3000,
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.90,
    sublinear_tf=True
)
X_tfidf = tfidf.fit_transform(df['clean_text'])
X_meta = compute_metadata_features(df['text'].values)

# Normalize metadata features to 0-1 range so they match the TF-IDF scale
# Without this, features like email_length (could be 1000+) would dominate
# over binary features like has_unsubscribe (just 0 or 1)
meta_scaler = MinMaxScaler()
X_meta_scaled = meta_scaler.fit_transform(X_meta)

# Column labels for the metadata block; presumably in the column order that
# compute_metadata_features produces (verified only by count, further down).
meta_feature_names = ['exclamation_density', 'dollar_sign_count', 'caps_word_ratio',
                      'spam_phrase_count', 'ham_phrase_count', 'net_spam_context',
                      'url_count', 'html_tag_count', 'email_length',
                      'avg_sentence_length', 'capitalization_ratio',
                      'has_specific_date', 'has_specific_time', 'date_reference_count',
                      'has_unsubscribe', 'has_physical_address', 'has_proper_greeting',
                      'has_contact_info', 'registration_language_score',
                      'cta_to_info_ratio', 'shortener_url_ratio',
                      'legitimate_platform_count', 'gov_edu_url_count',
                      'question_mark_count']

# Check if Ollama is available for LLM feature extraction
# SKIP_LLM_TRAINING flag skips the slow LLM calls during training since
# TF-IDF + metadata features are already enough for good accuracy
if SKIP_LLM_TRAINING:
    print('SKIP_LLM_TRAINING=True - skipping LLM features for faster training')
    ollama_available = False
else:
    ollama_available = check_ollama_available()

# Combine all feature matrices with scipy.sparse.hstack: TF-IDF is already
# sparse and converting to dense would use too much memory.
feature_blocks = [X_tfidf, csr_matrix(X_meta_scaled)]
feature_names = list(tfidf.get_feature_names_out()) + meta_feature_names
if ollama_available:
    print('Ollama available - extracting LLM intent/tone features...')
    X_llm = get_or_compute_llm_features(df['text'].values)
    feature_blocks.append(csr_matrix(X_llm))
    feature_names = feature_names + list(LLM_FEATURE_NAMES)
else:
    print('Ollama not available - skipping LLM features')
X_combined = hstack(feature_blocks)

# feature_names is saved next to the model, so it must line up with the
# matrix columns; fail loudly here rather than save mislabeled features.
if X_combined.shape[1] != len(feature_names):
    raise ValueError(
        'Feature name count (%d) does not match feature matrix columns (%d)'
        % (len(feature_names), X_combined.shape[1])
    )

# Binary target: 1 for spam, 0 for everything else.
y = (df['label'] == 'spam').astype(int)
n_llm = len(LLM_FEATURE_NAMES) if ollama_available else 0
print('Total features: %d (%d TF-IDF + %d metadata + %d LLM)' % (len(feature_names), X_tfidf.shape[1], len(meta_feature_names), n_llm))
def _highest_threshold_for_ham_precision(labels, spam_proba, candidates,
                                         target=0.99, default=0.50):
    """Return the largest candidate cutoff whose predicted-ham set is clean enough.

    An email counts as predicted ham when its spam probability is strictly
    below the cutoff. Candidates are tried from highest to lowest and the
    first one whose predicted-ham set contains at least *target* true ham
    (label 0) is returned; *default* is returned when none qualifies.
    """
    for cutoff in sorted(candidates, reverse=True):
        ham_mask = spam_proba < cutoff
        ham_total = ham_mask.sum()
        if ham_total == 0:
            # Nothing would be classified as ham at this cutoff.
            continue
        if (labels[ham_mask] == 0).sum() / ham_total >= target:
            return cutoff
    return default


# Hold out 30% of the rows for testing, preserving the spam/ham ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X_combined,
    y,
    test_size=0.3,
    random_state=random_state,
    stratify=y,
)

print('Running GridSearchCV...')
# Reduced grid from 8 combos to 4, and CV from 5 to 3 folds for faster training
param_grid = dict(n_estimators=[100, 200], max_depth=[20, None])
# class_weight='balanced' compensates for class imbalance
rf = RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=random_state)
grid_search = GridSearchCV(rf, param_grid, scoring='f1', cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)
best_rf = grid_search.best_estimator_
print('Best params: %s' % str(grid_search.best_params_))
print('Best CV F1: %.4f' % grid_search.best_score_)

# Calibrate the forest's probabilities so the confidence percentages (and
# the threshold slider in the app) are more meaningful than raw RF outputs.
print('Calibrating probabilities with isotonic regression...')
calibrated_rf = CalibratedClassifierCV(best_rf, method='isotonic', cv=5)
calibrated_rf.fit(X_train, y_train)

# Spam probability for each test row, and the candidate cutoffs derived from
# the precision/recall curve.
y_test_proba = calibrated_rf.predict_proba(X_test)[:, 1]
precision, recall, thresholds_pr = precision_recall_curve(y_test, y_test_proba)

# Higher threshold = more emails classified as ham, so take the highest one
# that still keeps ham predictions at >= 99% precision.
# NOTE(review): the threshold is chosen on the same test split it is then
# evaluated on below, so the "optimal threshold" report is optimistic.
best_threshold = _highest_threshold_for_ham_precision(
    y_test.values, y_test_proba, thresholds_pr, target=0.99, default=0.50
)
optimal_threshold = best_threshold
print('Optimal threshold (99%% ham precision): %.4f' % optimal_threshold)

y_pred = calibrated_rf.predict(X_test)
y_pred_optimal = (y_test_proba >= optimal_threshold).astype(int)
class_names = ['Ham', 'Spam']
print('\nTest Set Performance (default 0.5 threshold):')
print(classification_report(y_test, y_pred, target_names=class_names))
print('Test Set Performance (optimal %.2f threshold):' % optimal_threshold)
print(classification_report(y_test, y_pred_optimal, target_names=class_names))
# Save model artifacts
print('Saving model files...')
models_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(calibrated_rf, models_dir / 'random_forest_spam.joblib')
joblib.dump(best_rf, models_dir / 'random_forest_raw.joblib')
joblib.dump(tfidf, models_dir / 'tfidf_vectorizer.joblib')
joblib.dump(feature_names, models_dir / 'feature_names.joblib')
joblib.dump(optimal_threshold, models_dir / 'optimal_threshold.joblib')
joblib.dump(meta_scaler, models_dir / 'meta_scaler.joblib')

# Save training sample for LIME (200 random rows from training set).
# Row indices are drawn first and only those rows are converted to dense;
# densifying the whole sparse training matrix just to keep 200 rows would
# allocate rows x features floats for nothing. The seed and draw are the
# same as before, so the saved sample is unchanged.
rng = np.random.RandomState(random_state)
n_train_rows = X_train.shape[0]
sample_idx = rng.choice(n_train_rows, size=min(200, n_train_rows), replace=False)
training_sample = X_train.tocsr()[sample_idx].toarray()
joblib.dump(training_sample, models_dir / 'training_sample.joblib')

# Save training config (tracks whether LLM features were used)
# NOTE(review): 'model_used' is hard-coded to 'gemma3:1b', while the LLM
# helpers in this script default to 'qwen3.5:2b' - confirm which value the
# consumer of training_config expects before changing it.
joblib.dump({
    'llm_features_used': ollama_available,
    'llm_feature_names': LLM_FEATURE_NAMES if ollama_available else [],
    'model_used': 'gemma3:1b',
}, models_dir / 'training_config.joblib')

print('\nSaved: calibrated model, vectorizer, feature_names, optimal_threshold (%.4f), training_sample %s, training_config to models/' % (optimal_threshold, str(training_sample.shape)))
print('All done!')