|
|
|
|
|
|
|
|
| import re
|
| import warnings
|
| import numpy as np
|
| import pandas as pd
|
| from pathlib import Path
|
| from sklearn.model_selection import train_test_split, GridSearchCV
|
| from sklearn.feature_extraction.text import TfidfVectorizer
|
| from sklearn.ensemble import RandomForestClassifier
|
|
|
|
|
| from sklearn.calibration import CalibratedClassifierCV
|
| from sklearn.metrics import classification_report, f1_score, precision_recall_curve
|
|
|
|
|
| from sklearn.preprocessing import MinMaxScaler
|
|
|
|
|
| from scipy.sparse import hstack, csr_matrix
|
| import joblib
|
| import json
|
| import hashlib
|
| import requests
|
| from tqdm import tqdm
|
| from utils_student import (preprocess_text, compute_metadata_features,
|
| spam_context_phrases, ham_context_phrases,
|
| registration_phrases, url_shorteners,
|
| legitimate_platforms, OLLAMA_API, LLM_FEATURE_NAMES)
|
|
|
# Keep training logs readable: hide FutureWarning / DeprecationWarning noise.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

# All locations are resolved relative to the directory containing this script.
project_dir = Path(__file__).parent
raw_dir = project_dir.joinpath('data', 'raw')
data_dir = project_dir.joinpath('data', 'processed')
models_dir = project_dir.joinpath('models')

# Single seed reused wherever this script needs reproducible randomness.
random_state = 42

# Expected locations of the three raw training corpora.
enron_csv = raw_dir / 'enron' / 'enron_spam_data.csv'
puyang_parquet = raw_dir / 'puyang2025' / 'seven_phishing_emails.parquet'
zefang_parquet = raw_dir / 'zefang' / 'phishing_emails.parquet'

# When True, the training run does not query Ollama and builds no LLM features.
SKIP_LLM_TRAINING = True
|
|
|
def check_ollama_available(model='qwen3.5:2b'):
    """Tell whether the local Ollama server responds and lists *model*.

    The tag listing at ``http://localhost:11434/api/tags`` is fetched with a
    2 second timeout. *model* matches when it occurs as a substring of any
    listed model name.

    Returns:
        True when a matching model name is listed; False on a non-200 reply,
        when nothing matches, or when anything at all goes wrong.
    """
    try:
        response = requests.get('http://localhost:11434/api/tags', timeout=2)
        if response.status_code != 200:
            return False
        installed = [entry['name'] for entry in response.json().get('models', [])]
        return any(model in tag for tag in installed)
    except Exception:
        # Best-effort probe: any failure simply means "not available".
        return False
|
|
|
|
|
def extract_llm_features_single(text, model='qwen3.5:2b'):
    """Score one email on six intent/tone dimensions via the Ollama chat API.

    Only the first 500 characters of *text* are sent. The model is asked to
    answer with a flat JSON object; the first ``{...}`` span in the reply
    (after stripping any ``<think>`` section) is parsed.

    Args:
        text: Email text; it is sliced, so it must be a string.
        model: Ollama model tag to query.

    Returns:
        A list of six floats in the order promotional, transactional,
        personal, phishing, urgency, formality. Each score is clamped to
        [0.0, 1.0] because the prompt requests that range but the model is
        free to ignore it; a missing key or NaN becomes 0.5. Any failure
        (request error, non-200 status, unparsable reply) yields
        ``[0.5] * 6``.
    """
    score_keys = ('promotional', 'transactional', 'personal',
                  'phishing', 'urgency', 'formality')
    neutral = 0.5

    truncated = text[:500]
    prompt = (
        'Rate this email on these dimensions (0.0 to 1.0).\n'
        'Respond with ONLY valid JSON: {"promotional": X, "transactional": X, '
        '"personal": X, "phishing": X, "urgency": X, "formality": X}\n'
        '/no_think\n\n'
        'Email: "%s"' % truncated
    )
    try:
        resp = requests.post(OLLAMA_API, json={
            'model': model,
            'messages': [{'role': 'user', 'content': prompt}],
            'stream': False,
            'think': False,
            'options': {'temperature': 0.1, 'num_predict': 100}
        }, timeout=30)
        if resp.status_code == 200:
            content = resp.json().get('message', {}).get('content', '')
            content = re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL).strip()
            json_match = re.search(r'\{[^}]+\}', content)
            if json_match:
                data = json.loads(json_match.group())
                scores = []
                for key in score_keys:
                    value = float(data.get(key, neutral))
                    if value != value:
                        # NaN (json.loads accepts the bare token) carries no
                        # information; treat it like a missing score.
                        value = neutral
                    scores.append(min(1.0, max(0.0, value)))
                return scores
    except Exception:
        # Deliberately broad: this runs once per email in a long batch, and a
        # single bad reply must not abort the whole extraction.
        pass
    return [neutral] * len(score_keys)
|
|
|
|
|
|
|
|
|
def get_text_hash(text):
    """Return a short, repeatable identifier for an email body.

    The text is UTF-8 encoded (characters that cannot be encoded are
    replaced), hashed with SHA-256, and the first 16 hex digits are returned.
    """
    hasher = hashlib.sha256()
    hasher.update(text.encode('utf-8', errors='replace'))
    full_digest = hasher.hexdigest()
    return full_digest[:16]
|
|
|
|
|
# CSV file holding previously computed LLM feature rows, keyed by text hash.
CACHE_PATH = data_dir.joinpath('llm_features_cache.csv')
|
|
|
|
|
def get_or_compute_llm_features(texts, model='qwen3.5:2b'):
    """Return one row of LLM features per text, backed by the CSV cache.

    Texts are keyed by ``get_text_hash``. Rows already present in
    ``CACHE_PATH`` are reused; the rest are computed with
    ``extract_llm_features_single`` (once per distinct hash) and appended to
    the cache file.

    Args:
        texts: Indexable sequence of email strings.
        model: Ollama model tag forwarded to the extractor.

    Returns:
        A float array of shape ``(len(texts), len(LLM_FEATURE_NAMES))`` whose
        rows follow the order of *texts*.
    """
    hashes = [get_text_hash(t) for t in texts]
    n_features = len(LLM_FEATURE_NAMES)

    cached = {}
    cache_df = None
    if CACHE_PATH.exists():
        # Read the key column as text: a column of all-digit hashes would
        # otherwise be parsed as numbers and never match a computed hash.
        cache_df = pd.read_csv(CACHE_PATH, index_col='hash', dtype={'hash': str})
        # Duplicate keys would make a per-key lookup ambiguous; keep the newest.
        cache_df = cache_df[~cache_df.index.duplicated(keep='last')]
        cached = dict(zip(cache_df.index, cache_df[LLM_FEATURE_NAMES].values.tolist()))
        print(' Loaded %d cached LLM features' % len(cached))

    # First position of every hash that still needs computing, so that
    # repeated texts trigger only one LLM call.
    pending = {}
    for i, h in enumerate(hashes):
        if h not in cached and h not in pending:
            pending[h] = i
    print(' Need to compute %d new LLM features' % len(pending))

    if pending:
        new_entries = []
        try:
            for h, i in tqdm(list(pending.items()), desc=' Extracting LLM features'):
                feats = extract_llm_features_single(texts[i], model=model)
                cached[h] = feats
                new_entries.append({'hash': h, **dict(zip(LLM_FEATURE_NAMES, feats))})
        finally:
            # Write whatever was computed even when the loop is interrupted,
            # so a long extraction run does not have to start over.
            # NOTE(review): the extractor's neutral fallback rows are cached
            # like real results, so a transient failure is never retried —
            # confirm whether that is intended.
            if new_entries:
                new_df = pd.DataFrame(new_entries).set_index('hash')
                if cache_df is not None:
                    combined = pd.concat([cache_df, new_df])
                    combined = combined[~combined.index.duplicated(keep='last')]
                else:
                    combined = new_df
                CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
                combined.to_csv(CACHE_PATH)
                print(' Saved %d new features to cache' % len(new_entries))

    # Explicit reshape keeps the result two-dimensional for empty input too.
    result = np.array([cached[h] for h in hashes], dtype=float)
    return result.reshape(len(hashes), n_features)
|
|
|
|
|
print('Starting model training...')

# Every source that is present on disk is reduced to two columns — 'text' and
# a 'label' of 'ham' or 'spam' — and collected here. The frames are joined in
# a single concat at the end instead of growing an initially empty frame.
frames = []

if enron_csv.exists():
    print('Loading Enron email dataset...')
    df_enron = pd.read_csv(enron_csv)
    df_enron = df_enron.rename(columns={'Message': 'text', 'Spam/Ham': 'label'})
    df_enron['label'] = df_enron['label'].str.strip().str.lower()
    df_enron = df_enron[['text', 'label']].dropna(subset=['text', 'label'])
    df_enron = df_enron[df_enron['label'].isin(['spam', 'ham'])]
    print(' Enron: %d emails (%d ham, %d spam)' % (
        len(df_enron),
        int((df_enron['label'] == 'ham').sum()),
        int((df_enron['label'] == 'spam').sum())
    ))
    frames.append(df_enron)
else:
    print('WARNING: Enron CSV not found at %s' % str(enron_csv))

if puyang_parquet.exists():
    print('Loading puyang2025 seven-phishing-email-datasets...')
    df_puyang = pd.read_parquet(puyang_parquet)
    # Rows tagged 'Enron' are left out here; Enron is loaded from its own CSV.
    # The copy makes the filtered rows an independent frame so the label
    # assignment below does not write into a slice of the parquet frame.
    df_puyang = df_puyang[df_puyang['dataset_name'] != 'Enron'].copy()
    df_puyang['label'] = df_puyang['label'].map({0: 'ham', 1: 'spam'})
    df_puyang = df_puyang[['text', 'label']].dropna(subset=['text', 'label'])
    print(' puyang2025 (Enron excluded): %d emails (%d ham, %d spam)' % (
        len(df_puyang),
        int((df_puyang['label'] == 'ham').sum()),
        int((df_puyang['label'] == 'spam').sum())
    ))
    frames.append(df_puyang)
else:
    print('WARNING: puyang2025 parquet not found at %s' % str(puyang_parquet))

if zefang_parquet.exists():
    print('Loading zefang phishing dataset...')
    df_zefang = pd.read_parquet(zefang_parquet)
    # 'phishing' is folded into the 'spam' class; any other label value maps
    # to NaN and is removed by the dropna below.
    df_zefang['label'] = df_zefang['label'].map({'ham': 'ham', 'phishing': 'spam'})
    df_zefang = df_zefang[['text', 'label']].dropna(subset=['text', 'label'])
    print(' zefang phishing: %d emails (%d ham, %d phishing->spam)' % (
        len(df_zefang),
        int((df_zefang['label'] == 'ham').sum()),
        int((df_zefang['label'] == 'spam').sum())
    ))
    frames.append(df_zefang)
else:
    print('WARNING: zefang parquet not found at %s' % str(zefang_parquet))

if frames:
    df = pd.concat(frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=['text', 'label'])

if len(df) == 0:
    raise RuntimeError('No training data found. Run the data download step first.')

print('Combined dataset: %d emails' % len(df))
|
|
|
|
|
# User corrections are meant to count five times as much as an ordinary email.
# They are therefore merged in AFTER the corpus has been de-duplicated:
# de-duplicating afterwards would collapse the five copies back into one, and
# keeping the first occurrence would let an original (possibly wrong) corpus
# label win over the correction for the same text.
feedback_file = project_dir / 'data' / 'feedback' / 'feedback_log.csv'
df_feedback_clean = None
df_feedback_weighted = None
if feedback_file.exists():
    print('Loading user feedback corrections...')
    df_feedback = pd.read_csv(feedback_file)
    if 'email_text' in df_feedback.columns and 'correct_label' in df_feedback.columns:
        df_feedback_clean = df_feedback[['email_text', 'correct_label']].dropna()
        df_feedback_clean = df_feedback_clean.rename(columns={'email_text': 'text', 'correct_label': 'label'})
        # Normalise labels the same way the Enron labels are normalised, and
        # keep only the two classes the model knows about.
        df_feedback_clean = df_feedback_clean.assign(
            label=df_feedback_clean['label'].astype(str).str.strip().str.lower()
        )
        df_feedback_clean = df_feedback_clean[df_feedback_clean['label'].isin(['spam', 'ham'])]
        # One correction per text; the last row wins.
        # NOTE(review): assumes the log is appended chronologically — confirm.
        df_feedback_clean = df_feedback_clean.drop_duplicates(subset=['text'], keep='last')

        df_feedback_weighted = pd.concat([df_feedback_clean] * 5, ignore_index=True)
        print(' Feedback: %d corrections (5x weighted = %d rows)' % (len(df_feedback_clean), len(df_feedback_weighted)))

before = len(df)
df = df.drop_duplicates(subset=['text']).reset_index(drop=True)
removed = before - len(df)

if df_feedback_weighted is not None and not df_feedback_weighted.empty:
    # Remove corpus rows whose text has a correction, then add the replicated
    # corrections so the corrected label is the only one left for that text.
    # NOTE(review): the replicated rows can land on both sides of a later
    # train/test split; per-sample weights would avoid that.
    overridden = df['text'].isin(df_feedback_clean['text'])
    df = pd.concat([df[~overridden], df_feedback_weighted], ignore_index=True)

print('Total after dedup: %d emails (removed %d duplicates)' % (len(df), removed))

print('Preprocessing text...')
df['clean_text'] = df['text'].apply(preprocess_text)
|
|
|
|
|
print('Building features (3000 TF-IDF + 24 metadata + optional LLM)...')

y = (df['label'] == 'spam').astype(int)

# Split row positions BEFORE fitting anything. The TF-IDF vocabulary/IDF
# weights and the MinMax ranges are learned from training rows only, so the
# held-out rows cannot leak into the fitted transformers. Splitting positions
# with the same size, seed and stratification selects the same rows as
# splitting the assembled matrix would.
train_idx, test_idx = train_test_split(
    np.arange(len(df)), test_size=0.3, random_state=random_state, stratify=y
)

tfidf = TfidfVectorizer(
    max_features=3000,
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.90,
    sublinear_tf=True
)
X_tfidf_train = tfidf.fit_transform(df['clean_text'].iloc[train_idx])
X_tfidf_test = tfidf.transform(df['clean_text'].iloc[test_idx])

# Metadata features are computed per email, so computing them for all rows at
# once is safe; only the scaler has to be fitted on the training rows.
X_meta = np.asarray(compute_metadata_features(df['text'].values))

meta_scaler = MinMaxScaler()
X_meta_train = meta_scaler.fit_transform(X_meta[train_idx])
X_meta_test = meta_scaler.transform(X_meta[test_idx])

# NOTE(review): presumably listed in the column order produced by
# compute_metadata_features — confirm against that helper.
meta_feature_names = ['exclamation_density', 'dollar_sign_count', 'caps_word_ratio',
                      'spam_phrase_count', 'ham_phrase_count', 'net_spam_context',
                      'url_count', 'html_tag_count', 'email_length',
                      'avg_sentence_length', 'capitalization_ratio',
                      'has_specific_date', 'has_specific_time', 'date_reference_count',
                      'has_unsubscribe', 'has_physical_address', 'has_proper_greeting',
                      'has_contact_info', 'registration_language_score',
                      'cta_to_info_ratio', 'shortener_url_ratio',
                      'legitimate_platform_count', 'gov_edu_url_count',
                      'question_mark_count']

if SKIP_LLM_TRAINING:
    print('SKIP_LLM_TRAINING=True - skipping LLM features for faster training')
    ollama_available = False
else:
    ollama_available = check_ollama_available()

train_parts = [X_tfidf_train, csr_matrix(X_meta_train)]
test_parts = [X_tfidf_test, csr_matrix(X_meta_test)]
feature_names = list(tfidf.get_feature_names_out()) + meta_feature_names

if ollama_available:
    print('Ollama available - extracting LLM intent/tone features...')
    # LLM scores are produced per email with nothing fitted, so they can be
    # computed for all rows and then divided by the split positions.
    X_llm = get_or_compute_llm_features(df['text'].values)
    train_parts.append(csr_matrix(X_llm[train_idx]))
    test_parts.append(csr_matrix(X_llm[test_idx]))
    feature_names = feature_names + list(LLM_FEATURE_NAMES)
else:
    print('Ollama not available - skipping LLM features')

n_llm = len(LLM_FEATURE_NAMES) if ollama_available else 0
print('Total features: %d (%d TF-IDF + %d metadata + %d LLM)' % (len(feature_names), X_tfidf_train.shape[1], len(meta_feature_names), n_llm))

# CSR keeps the matrices row-indexable for later steps.
X_train = hstack(train_parts).tocsr()
X_test = hstack(test_parts).tocsr()
y_train = y.iloc[train_idx]
y_test = y.iloc[test_idx]
|
|
|
print('Running GridSearchCV...')

# Four candidate forests: two sizes, each with and without a depth cap.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[20, None],
)

rf = RandomForestClassifier(
    class_weight='balanced',
    n_jobs=-1,
    random_state=random_state,
)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# The winning configuration, as exposed by the fitted search object.
best_rf = grid_search.best_estimator_
print('Best params: %s' % str(grid_search.best_params_))
print('Best CV F1: %.4f' % grid_search.best_score_)
|
|
|
|
|
|
|
|
|
print('Calibrating probabilities with isotonic regression...')
calibrated_rf = CalibratedClassifierCV(best_rf, method='isotonic', cv=5)
calibrated_rf.fit(X_train, y_train)

# Spam probability for every held-out email, plus the candidate cut-offs.
y_test_proba = calibrated_rf.predict_proba(X_test)[:, 1]
precision, recall, thresholds_pr = precision_recall_curve(y_test, y_test_proba)

# Walk the cut-offs from highest to lowest and stop at the first one for which
# at least 99% of the emails scored below it are truly ham. If none qualifies
# the default of 0.50 is kept. precision_recall_curve returns its thresholds
# in increasing order, so the reversed view is the descending order.
# NOTE(review): the cut-off is chosen on the same split it is reported on.
y_test_labels = y_test.values
best_threshold = 0.50
for candidate in thresholds_pr[::-1]:
    scored_below = y_test_proba < candidate
    n_below = scored_below.sum()
    if n_below == 0:
        continue
    true_ham_below = (y_test_labels[scored_below] == 0).sum()
    if true_ham_below / n_below >= 0.99:
        best_threshold = candidate
        break
optimal_threshold = best_threshold

print('Optimal threshold (99%% ham precision): %.4f' % optimal_threshold)

y_pred = calibrated_rf.predict(X_test)
y_pred_optimal = (y_test_proba >= optimal_threshold).astype(int)

print('\nTest Set Performance (default 0.5 threshold):')
print(classification_report(y_test, y_pred, target_names=['Ham', 'Spam']))

print('Test Set Performance (optimal %.2f threshold):' % optimal_threshold)
print(classification_report(y_test, y_pred_optimal, target_names=['Ham', 'Spam']))
|
|
|
|
|
print('Saving model files...')
# parents=True so a missing intermediate directory does not fail the run at
# the very last step.
models_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(calibrated_rf, models_dir / 'random_forest_spam.joblib')
joblib.dump(best_rf, models_dir / 'random_forest_raw.joblib')
joblib.dump(tfidf, models_dir / 'tfidf_vectorizer.joblib')
joblib.dump(feature_names, models_dir / 'feature_names.joblib')
joblib.dump(optimal_threshold, models_dir / 'optimal_threshold.joblib')
joblib.dump(meta_scaler, models_dir / 'meta_scaler.joblib')

# Store up to 200 dense training rows. The rows are selected from the sparse
# matrix first and only that slice is densified; converting the whole training
# matrix to dense just to pick 200 rows can exhaust memory.
rng = np.random.RandomState(random_state)
n_train_rows = X_train.shape[0]
sample_idx = rng.choice(n_train_rows, size=min(200, n_train_rows), replace=False)
training_sample = X_train.tocsr()[sample_idx].toarray()
joblib.dump(training_sample, models_dir / 'training_sample.joblib')

# Record how the feature matrix was built. 'model_used' names the model tag
# this script actually queries: the Ollama helpers are called with their
# default, 'qwen3.5:2b'.
joblib.dump({
    'llm_features_used': ollama_available,
    'llm_feature_names': LLM_FEATURE_NAMES if ollama_available else [],
    'model_used': 'qwen3.5:2b',
}, models_dir / 'training_config.joblib')

print('\nSaved: calibrated model, vectorizer, feature_names, optimal_threshold (%.4f), training_sample %s, training_config to models/' % (optimal_threshold, str(training_sample.shape)))
print('All done!')
|
|
|