Upload folder using huggingface_hub

960ec3d verified 2 months ago

16.6 kB

	# Retrain the spam classifier model
	# ENGT 375 Project - Spring 2026 - ODU
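	# Loads the Enron spam CSV plus the puyang2025 and zefang phishing parquet datasets,
	# and any user feedback corrections found in data/feedback/feedback_log.csv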

	import re
	import warnings
	import numpy as np
	import pandas as pd
	from pathlib import Path
	from sklearn.model_selection import train_test_split, GridSearchCV
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.ensemble import RandomForestClassifier
	# I found CalibratedClassifierCV in the sklearn docs - it makes the probability
	# predictions more accurate instead of just using the raw RF outputs
	from sklearn.calibration import CalibratedClassifierCV
	from sklearn.metrics import classification_report, f1_score, precision_recall_curve
	# MinMaxScaler normalizes metadata features to 0-1 range so they're on the same
	# scale as the TF-IDF features (learned about scaling importance from Module 7A kNN)
	from sklearn.preprocessing import MinMaxScaler
	# hstack and csr_matrix let me combine the TF-IDF sparse matrix with the metadata
	# features without converting everything to a dense array (saves a lot of memory)
	from scipy.sparse import hstack, csr_matrix
	import joblib
	import json
	import hashlib
	import requests
	from tqdm import tqdm
	from utils_student import (preprocess_text, compute_metadata_features,
	spam_context_phrases, ham_context_phrases,
	registration_phrases, url_shorteners,
	legitimate_platforms, OLLAMA_API, LLM_FEATURE_NAMES)

	# Hide FutureWarning / DeprecationWarning messages raised by libraries during the run.
	for _ignored_category in (FutureWarning, DeprecationWarning):
	    warnings.filterwarnings('ignore', category=_ignored_category)

	# Seed shared by the train/test split, the random forest and the LIME sample.
	random_state = 42
	# When True the Ollama-based features are skipped entirely
	# (LLM features too slow for large datasets).
	SKIP_LLM_TRAINING = True

	# Folder layout, all relative to the directory that contains this script.
	project_dir = Path(__file__).parent
	models_dir = project_dir.joinpath('models')
	raw_dir = project_dir.joinpath('data', 'raw')
	data_dir = project_dir.joinpath('data', 'processed')

	# Raw dataset files read by the loading section further down.
	enron_csv = raw_dir.joinpath('enron', 'enron_spam_data.csv')
	puyang_parquet = raw_dir.joinpath('puyang2025', 'seven_phishing_emails.parquet')
	zefang_parquet = raw_dir.joinpath('zefang', 'phishing_emails.parquet')

	def check_ollama_available(model='qwen3.5:2b'):
	    """Report whether the local Ollama server lists a matching model.

	    Queries http://localhost:11434/api/tags with a 2 second timeout and
	    returns True when any listed model name contains `model` as a
	    substring. Any failure (server down, non-200 status, unexpected
	    payload) yields False instead of raising.
	    """
	    try:
	        response = requests.get('http://localhost:11434/api/tags', timeout=2)
	        if response.status_code != 200:
	            return False
	        installed = [entry['name'] for entry in response.json().get('models', [])]
	        for installed_name in installed:
	            if model in installed_name:
	                return True
	        return False
	    except Exception:
	        # Best-effort probe: every error simply means "not available".
	        return False


	def _coerce_llm_score(raw, default=0.5):
	    """Convert one raw LLM rating to a float clamped to [0.0, 1.0], else `default`."""
	    try:
	        score = float(raw)
	    except (TypeError, ValueError):
	        return default
	    if score != score:  # NaN is the only float that differs from itself
	        return default
	    return min(1.0, max(0.0, score))


	def extract_llm_features_single(text, model='qwen3.5:2b'):
	    """Extract intent and tone features for a single email via Ollama.

	    Args:
	        text: Email body. Only the first 500 characters are sent to the model.
	        model: Ollama model name used for the chat request.

	    Returns:
	        A list of six floats in [0.0, 1.0], in the order promotional,
	        transactional, personal, phishing, urgency, formality. A rating that
	        is missing or unparsable falls back to 0.5 on its own, so one bad
	        value no longer discards the other five. If the input is not a
	        string, the request fails, or no JSON object is found in the reply,
	        all six values are the neutral 0.5.
	    """
	    score_keys = ('promotional', 'transactional', 'personal',
	                  'phishing', 'urgency', 'formality')
	    neutral = [0.5] * len(score_keys)

	    # Slicing a non-string (None, NaN, numbers) would raise before the try
	    # block is reached, so treat such input as "no information".
	    if not isinstance(text, str):
	        return neutral

	    truncated = text[:500]
	    prompt = (
	        'Rate this email on these dimensions (0.0 to 1.0).\n'
	        'Respond with ONLY valid JSON: {"promotional": X, "transactional": X, '
	        '"personal": X, "phishing": X, "urgency": X, "formality": X}\n'
	        '/no_think\n\n'
	        'Email: "%s"' % truncated
	    )
	    try:
	        resp = requests.post(OLLAMA_API, json={
	            'model': model,
	            'messages': [{'role': 'user', 'content': prompt}],
	            'stream': False,
	            'think': False,
	            'options': {'temperature': 0.1, 'num_predict': 100}
	        }, timeout=30)
	        if resp.status_code != 200:
	            return neutral
	        content = resp.json().get('message', {}).get('content', '')
	        # Drop any <think>...</think> section before looking for the JSON answer.
	        content = re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL).strip()
	        json_match = re.search(r'\{[^}]+\}', content)
	        if not json_match:
	            return neutral
	        data = json.loads(json_match.group())
	        return [_coerce_llm_score(data.get(key, 0.5)) for key in score_keys]
	    except Exception:
	        # Deliberate best-effort boundary: a network error, timeout or malformed
	        # reply must not abort feature extraction for the remaining emails.
	        return neutral


	# I use a hash of each email text as a cache key so I don't have to re-run
	# the LLM for emails I've already processed (the LLM calls are really slow)
	def get_text_hash(text):
	    """Return the first 16 hex characters of the SHA-256 digest of `text`.

	    The text is encoded as UTF-8; characters that cannot be encoded are
	    replaced rather than raising.
	    """
	    encoded = text.encode('utf-8', errors='replace')
	    digest = hashlib.sha256(encoded).hexdigest()
	    return digest[:16]


	# CSV file that stores one row of LLM features per email hash.
	CACHE_PATH = data_dir / 'llm_features_cache.csv'


	def get_or_compute_llm_features(texts, model='qwen3.5:2b'):
	    """Load cached LLM features, compute missing ones, return combined array.

	    Args:
	        texts: Indexable sequence of email strings.
	        model: Ollama model name forwarded to extract_llm_features_single.

	    Returns:
	        A NumPy array with one row per entry of `texts`, in the same order,
	        and one column per name in LLM_FEATURE_NAMES. Empty input yields an
	        array of shape (0, len(LLM_FEATURE_NAMES)).

	    Side effects:
	        Reads CACHE_PATH if it exists, rewrites it when new features were
	        computed (creating the parent folder if needed), and prints progress.
	    """
	    hashes = [get_text_hash(t) for t in texts]

	    # Load existing cache. The hash column is forced to str so that an
	    # all-digit hash is not parsed as a number, and duplicated hashes are
	    # collapsed so every key maps to exactly one flat feature row.
	    cached = {}
	    if CACHE_PATH.exists():
	        cache_df = pd.read_csv(CACHE_PATH, index_col='hash', dtype={'hash': str})
	        cache_df = cache_df[~cache_df.index.duplicated(keep='last')]
	        cached = dict(zip(cache_df.index, cache_df[LLM_FEATURE_NAMES].values.tolist()))
	        print(' Loaded %d cached LLM features' % len(cached))

	    # Identify uncached hashes, remembering only the first position of each
	    # so identical emails trigger a single (slow) LLM call.
	    pending = {}
	    for position, h in enumerate(hashes):
	        if h not in cached and h not in pending:
	            pending[h] = position
	    print(' Need to compute %d new LLM features' % len(pending))

	    if pending:
	        new_entries = []
	        for h, position in tqdm(pending.items(), desc=' Extracting LLM features'):
	            feats = extract_llm_features_single(texts[position], model=model)
	            cached[h] = feats
	            new_entries.append({'hash': h, **dict(zip(LLM_FEATURE_NAMES, feats))})

	        to_save = pd.DataFrame(new_entries).set_index('hash')
	        if CACHE_PATH.exists():
	            # Re-read at write time so rows added to the file since the
	            # initial load are merged instead of overwritten.
	            on_disk = pd.read_csv(CACHE_PATH, index_col='hash', dtype={'hash': str})
	            to_save = pd.concat([on_disk, to_save])
	            to_save = to_save[~to_save.index.duplicated(keep='last')]
	        # Make sure the folder exists so the computed features are not lost.
	        CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
	        to_save.to_csv(CACHE_PATH)
	        print(' Saved %d new features to cache' % len(new_entries))

	    # Build output array in input order.
	    if not hashes:
	        return np.empty((0, len(LLM_FEATURE_NAMES)))
	    return np.array([cached[h] for h in hashes])


	print('Starting model training...')

	# Every corpus that is present on disk is reduced to two columns,
	# 'text' and 'label' ('ham' / 'spam'), and collected here. The frames are
	# concatenated once at the end instead of growing an initially empty frame.
	frames = []

	# Enron corpus — gold standard real corporate email (~33k)
	if enron_csv.exists():
	    print('Loading Enron email dataset...')
	    df_enron = pd.read_csv(enron_csv)
	    df_enron = df_enron.rename(columns={'Message': 'text', 'Spam/Ham': 'label'})
	    df_enron['label'] = df_enron['label'].str.strip().str.lower()
	    df_enron = df_enron[['text', 'label']].dropna(subset=['text', 'label'])
	    # Keep only rows whose normalized label is one of the two known classes.
	    df_enron = df_enron[df_enron['label'].isin(['spam', 'ham'])]
	    print(' Enron: %d emails (%d ham, %d spam)' % (
	        len(df_enron),
	        (df_enron['label'] == 'ham').sum(),
	        (df_enron['label'] == 'spam').sum()
	    ))
	    frames.append(df_enron)
	else:
	    print('WARNING: Enron CSV not found at %s' % str(enron_csv))

	# puyang2025 — 7 research corpora (TREC-05/06/07, CEAS-08, SpamAssassin, Ling-Spam)
	# Enron sub-corpus excluded to avoid duplicating emails already loaded above.
	if puyang_parquet.exists():
	    print('Loading puyang2025 seven-phishing-email-datasets...')
	    df_puyang = pd.read_parquet(puyang_parquet)
	    # .copy() so the label assignment below writes to an independent frame
	    # rather than to a filtered view of the parquet data.
	    df_puyang = df_puyang[df_puyang['dataset_name'] != 'Enron'].copy()
	    # Labels other than 0 / 1 become NaN here and are dropped on the next line.
	    df_puyang['label'] = df_puyang['label'].map({0: 'ham', 1: 'spam'})
	    df_puyang = df_puyang[['text', 'label']].dropna(subset=['text', 'label'])
	    print(' puyang2025 (Enron excluded): %d emails (%d ham, %d spam)' % (
	        len(df_puyang),
	        (df_puyang['label'] == 'ham').sum(),
	        (df_puyang['label'] == 'spam').sum()
	    ))
	    frames.append(df_puyang)
	else:
	    print('WARNING: puyang2025 parquet not found at %s' % str(puyang_parquet))

	# zefang phishing dataset — 18k emails labeled ham vs phishing
	# Phishing is treated as the positive (spam) class for binary classification.
	if zefang_parquet.exists():
	    print('Loading zefang phishing dataset...')
	    df_zefang = pd.read_parquet(zefang_parquet)
	    # Labels other than 'ham' / 'phishing' become NaN and are dropped below.
	    df_zefang['label'] = df_zefang['label'].map({'ham': 'ham', 'phishing': 'spam'})
	    df_zefang = df_zefang[['text', 'label']].dropna(subset=['text', 'label'])
	    print(' zefang phishing: %d emails (%d ham, %d phishing->spam)' % (
	        len(df_zefang),
	        (df_zefang['label'] == 'ham').sum(),
	        (df_zefang['label'] == 'spam').sum()
	    ))
	    frames.append(df_zefang)
	else:
	    print('WARNING: zefang parquet not found at %s' % str(zefang_parquet))

	if frames:
	    df = pd.concat(frames, ignore_index=True)
	else:
	    df = pd.DataFrame(columns=['text', 'label'])

	if len(df) == 0:
	    raise RuntimeError('No training data found. Run the data download step first.')

	print('Combined dataset: %d emails' % len(df))

	# Deduplicate the base corpora BEFORE adding feedback. Running
	# drop_duplicates after the 5x replication would collapse the copies back
	# to one row (undoing the weighting) and, because the first occurrence is
	# kept, would keep the original corpus label instead of the correction.
	before = len(df)
	df = df.drop_duplicates(subset=['text']).reset_index(drop=True)
	print('Total after dedup: %d emails (removed %d duplicates)' % (len(df), before - len(df)))

	# Load user feedback corrections (if any)
	feedback_file = project_dir / 'data' / 'feedback' / 'feedback_log.csv'
	if feedback_file.exists():
	    print('Loading user feedback corrections...')
	    df_feedback = pd.read_csv(feedback_file)
	    if 'email_text' in df_feedback.columns and 'correct_label' in df_feedback.columns:
	        df_feedback_clean = df_feedback[['email_text', 'correct_label']].dropna()
	        df_feedback_clean = df_feedback_clean.rename(columns={'email_text': 'text', 'correct_label': 'label'})
	        # Normalize labels the same way as the Enron labels. Anything that is
	        # not exactly 'spam' would otherwise be counted as ham when the
	        # target is built from label == 'spam'.
	        df_feedback_clean['label'] = df_feedback_clean['label'].astype(str).str.strip().str.lower()
	        df_feedback_clean = df_feedback_clean[df_feedback_clean['label'].isin(['spam', 'ham'])]
	        # If the same email was corrected more than once, the latest row wins.
	        df_feedback_clean = df_feedback_clean.drop_duplicates(subset=['text'], keep='last')
	        # A correction replaces the corpus row with the same text.
	        df = df[~df['text'].isin(df_feedback_clean['text'])]
	        # Weight corrections 5x to amplify their impact
	        # NOTE(review): the replicated rows can land on both sides of the
	        # random train/test split; per-sample weights would avoid that.
	        df_feedback_weighted = pd.concat([df_feedback_clean] * 5, ignore_index=True)
	        df = pd.concat([df, df_feedback_weighted], ignore_index=True)
	        print(' Feedback: %d corrections (5x weighted = %d rows)' % (len(df_feedback_clean), len(df_feedback_weighted)))

	print('Preprocessing text...')
	df['clean_text'] = df['text'].apply(preprocess_text)
	y = (df['label'] == 'spam').astype(int)

	# Split into train/test FIRST (by row position) so that the TF-IDF
	# vocabulary / IDF weights and the MinMaxScaler ranges below are learned
	# from training rows only. Fitting them on all rows lets test data leak
	# into preprocessing and makes the reported test scores optimistic.
	# The partition is the same one a split of the finished matrix would give,
	# because it depends only on the row count, y and random_state.
	train_idx, test_idx = train_test_split(
	    np.arange(len(df)), test_size=0.3, random_state=random_state, stratify=y
	)
	y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

	# Build expanded TF-IDF features + metadata features + optional LLM features
	print('Building features (3000 TF-IDF + 24 metadata + optional LLM)...')
	tfidf = TfidfVectorizer(
	    max_features=3000,
	    ngram_range=(1, 3),
	    min_df=2,
	    max_df=0.90,
	    sublinear_tf=True
	)
	X_tfidf_train = tfidf.fit_transform(df['clean_text'].iloc[train_idx])
	X_tfidf_test = tfidf.transform(df['clean_text'].iloc[test_idx])

	# Metadata features are computed in a single call over all rows and then
	# sliced per split (one output row per input text is required for the
	# column-wise stacking with the TF-IDF matrix).
	X_meta = np.asarray(compute_metadata_features(df['text'].values))

	# Normalize metadata features to 0-1 range so they match the TF-IDF scale
	# Without this, features like email_length (could be 1000+) would dominate
	# over binary features like has_unsubscribe (just 0 or 1)
	meta_scaler = MinMaxScaler()
	X_meta_train = meta_scaler.fit_transform(X_meta[train_idx])
	X_meta_test = meta_scaler.transform(X_meta[test_idx])

	meta_feature_names = ['exclamation_density', 'dollar_sign_count', 'caps_word_ratio',
	                      'spam_phrase_count', 'ham_phrase_count', 'net_spam_context',
	                      'url_count', 'html_tag_count', 'email_length',
	                      'avg_sentence_length', 'capitalization_ratio',
	                      'has_specific_date', 'has_specific_time', 'date_reference_count',
	                      'has_unsubscribe', 'has_physical_address', 'has_proper_greeting',
	                      'has_contact_info', 'registration_language_score',
	                      'cta_to_info_ratio', 'shortener_url_ratio',
	                      'legitimate_platform_count', 'gov_edu_url_count',
	                      'question_mark_count']

	# Check if Ollama is available for LLM feature extraction
	# SKIP_LLM_TRAINING flag skips the slow LLM calls during training since
	# TF-IDF + metadata features are already enough for good accuracy
	if SKIP_LLM_TRAINING:
	    print('SKIP_LLM_TRAINING=True - skipping LLM features for faster training')
	    ollama_available = False
	else:
	    ollama_available = check_ollama_available()
	    if not ollama_available:
	        print('Ollama not available - skipping LLM features')

	# Combine the feature matrices with scipy.sparse.hstack: TF-IDF is already
	# sparse and converting everything to dense would use too much memory.
	train_parts = [X_tfidf_train, csr_matrix(X_meta_train)]
	test_parts = [X_tfidf_test, csr_matrix(X_meta_test)]
	feature_names = list(tfidf.get_feature_names_out()) + meta_feature_names
	if ollama_available:
	    print('Ollama available - extracting LLM intent/tone features...')
	    X_llm = np.asarray(get_or_compute_llm_features(df['text'].values))
	    train_parts.append(csr_matrix(X_llm[train_idx]))
	    test_parts.append(csr_matrix(X_llm[test_idx]))
	    feature_names = feature_names + LLM_FEATURE_NAMES

	X_train = hstack(train_parts).tocsr()
	X_test = hstack(test_parts).tocsr()

	n_llm = len(LLM_FEATURE_NAMES) if ollama_available else 0
	print('Total features: %d (%d TF-IDF + %d metadata + %d LLM)' % (len(feature_names), X_tfidf_train.shape[1], len(meta_feature_names), n_llm))

	print('Running GridSearchCV...')
	# Search space: 2 forest sizes x 2 depth limits = 4 candidates, each scored
	# with 3-fold cross-validation on F1 (kept small for faster training).
	param_grid = dict(
	    n_estimators=[100, 200],
	    max_depth=[20, None],
	)
	# class_weight='balanced' compensates for class imbalance between ham and spam.
	rf = RandomForestClassifier(
	    class_weight='balanced',
	    n_jobs=-1,
	    random_state=random_state,
	)
	grid_search = GridSearchCV(
	    estimator=rf,
	    param_grid=param_grid,
	    scoring='f1',
	    cv=3,
	    n_jobs=-1,
	    verbose=1,
	)
	grid_search.fit(X_train, y_train)

	# Keep the refitted winner and report how it was chosen.
	best_rf = grid_search.best_estimator_
	print('Best params: %s' % str(grid_search.best_params_))
	print('Best CV F1: %.4f' % grid_search.best_score_)

	# Wrap the tuned forest in an isotonic calibrator (5-fold) so that the
	# predicted probabilities behave more like real confidence values; raw RF
	# probabilities can be overconfident, which would make a probability
	# threshold slider less useful.
	print('Calibrating probabilities with isotonic regression...')
	calibrated_rf = CalibratedClassifierCV(best_rf, method='isotonic', cv=5)
	calibrated_rf.fit(X_train, y_train)

	# Spam probability for every test row, plus the candidate thresholds that
	# precision_recall_curve derives from those probabilities.
	y_test_proba = calibrated_rf.predict_proba(X_test)[:, 1]
	precision, recall, thresholds_pr = precision_recall_curve(y_test, y_test_proba)

	# Walk the candidates from highest to lowest and stop at the first one for
	# which at least 99% of the rows predicted as ham (probability below the
	# candidate) really are ham. A higher threshold sends more emails to ham,
	# so the first hit is the most permissive threshold that still meets the
	# target. If no candidate qualifies, 0.50 is used.
	# NOTE(review): the threshold is chosen on the same test set that is
	# reported below, so the "optimal threshold" report is optimistic.
	y_test_true = y_test.values
	best_threshold = 0.50
	for candidate in sorted(thresholds_pr, reverse=True):
	    predicted_ham_mask = y_test_proba < candidate
	    n_predicted_ham = predicted_ham_mask.sum()
	    if n_predicted_ham == 0:
	        continue
	    n_correct_ham = (y_test_true[predicted_ham_mask] == 0).sum()
	    ham_precision = n_correct_ham / n_predicted_ham
	    if ham_precision >= 0.99:
	        best_threshold = candidate
	        break
	optimal_threshold = best_threshold

	print('Optimal threshold (99%% ham precision): %.4f' % optimal_threshold)

	# Predictions at the classifier's default cut-off and at the tuned one.
	y_pred = calibrated_rf.predict(X_test)
	y_pred_optimal = (y_test_proba >= optimal_threshold).astype(int)

	class_names = ['Ham', 'Spam']
	print('\nTest Set Performance (default 0.5 threshold):')
	print(classification_report(y_test, y_pred, target_names=class_names))

	print('Test Set Performance (optimal %.2f threshold):' % optimal_threshold)
	print(classification_report(y_test, y_pred_optimal, target_names=class_names))

	# Save model artifacts
	print('Saving model files...')
	models_dir.mkdir(parents=True, exist_ok=True)
	joblib.dump(calibrated_rf, models_dir / 'random_forest_spam.joblib')
	joblib.dump(best_rf, models_dir / 'random_forest_raw.joblib')
	joblib.dump(tfidf, models_dir / 'tfidf_vectorizer.joblib')
	joblib.dump(feature_names, models_dir / 'feature_names.joblib')
	joblib.dump(optimal_threshold, models_dir / 'optimal_threshold.joblib')
	joblib.dump(meta_scaler, models_dir / 'meta_scaler.joblib')

	# Save training sample for LIME (200 random rows from training set).
	# The rows are picked while the matrix is still sparse and only those rows
	# are converted to dense; densifying the whole training matrix first would
	# allocate rows x features floats just to keep 200 of them.
	rng = np.random.RandomState(random_state)
	n_train_rows = X_train.shape[0]
	sample_idx = rng.choice(n_train_rows, size=min(200, n_train_rows), replace=False)
	training_sample = X_train.tocsr()[sample_idx].toarray()
	joblib.dump(training_sample, models_dir / 'training_sample.joblib')

	# Save training config (tracks whether LLM features were used).
	# 'model_used' records the Ollama model that the LLM feature functions in
	# this script use by default ('qwen3.5:2b'); the previous value named a
	# different model than the one the features would have come from.
	joblib.dump({
	    'llm_features_used': ollama_available,
	    'llm_feature_names': LLM_FEATURE_NAMES if ollama_available else [],
	    'model_used': 'qwen3.5:2b',
	}, models_dir / 'training_config.joblib')

	print('\nSaved: calibrated model, vectorizer, feature_names, optimal_threshold (%.4f), training_sample %s, training_config to models/' % (optimal_threshold, str(training_sample.shape)))
	print('All done!')