# Quick accuracy test for the retrained spam classifier
#
# Loads the persisted model, TF-IDF vectorizer and decision threshold from the
# ``models`` directory next to this file, classifies a fixed set of hand-written
# emails and prints a pass/fail table plus per-class detection counts.
import re
from pathlib import Path

import joblib
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from scipy.sparse import hstack, csr_matrix

models_dir = Path(__file__).parent / 'models'

# NOTE: joblib.load unpickles arbitrary Python objects; only point this script
# at model files that come from a trusted source.
model = joblib.load(models_dir / 'random_forest_spam.joblib')
vectorizer = joblib.load(models_dir / 'tfidf_vectorizer.joblib')
threshold = joblib.load(models_dir / 'optimal_threshold.joblib')

stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Phrase / domain lists used by compute_meta().  They are matched as plain
# substrings of the lower-cased text (or of the lower-cased URL host).
spam_context_phrases = [
    'act now', 'limited time', 'click to claim', 'you have won',
    'wire transfer', 'bank account', 'million dollar', 'free gift',
    'no prescription', 'buy now', 'make money fast', 'lose weight',
    'casino', 'free credit',
]
ham_context_phrases = [
    'click to unsubscribe', 'unsubscribe from', 'to opt out',
    'this newsletter', 'you are receiving this', 'official notice',
    'department of', 'office of', 'subscribe to updates',
    'manage your subscription', 'privacy policy', 'government website',
    'register now', 'sign up', 'reserve your spot', 'rsvp', 'event details',
    'schedule', 'agenda', 'venue', 'annual', 'edition', 'season', 'community',
]
registration_phrases = [
    'register now', 'sign up', 'reserve your spot', 'rsvp',
    'registration open', 'tickets available', 'limited capacity',
    'early bird', 'reserve your seat', 'join us',
]
url_shorteners = [
    'bit.ly', 'tinyurl.com', 'goo.gl', 't.co', 'ow.ly', 'is.gd', 'buff.ly',
    'adf.ly', 'shorte.st',
]
legitimate_platforms = [
    'eventbrite.com', 'meetup.com', 'mailchimp.com', 'constantcontact.com',
    'surveymonkey.com', 'google.com', 'zoom.us', 'microsoft.com',
    'linkedin.com', 'facebook.com', 'github.com', 'youtube.com',
]


def preprocess_text(text: str) -> str:
    """Normalize raw email text into the token string fed to the vectorizer.

    HTML tags, URLs, e-mail-like tokens and every non-letter character are
    replaced by spaces; the result is lower-cased, split on whitespace,
    stripped of English stop words and of tokens of length <= 2, and each
    remaining token is Porter-stemmed.

    Returns the stemmed tokens joined by single spaces.
    """
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    text = re.sub(r'\S+@\S+', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    text = text.lower()
    tokens = text.split()
    return ' '.join(
        [stemmer.stem(w) for w in tokens if w not in stop_words and len(w) > 2]
    )


def compute_meta(text: str) -> np.ndarray:
    """Compute the hand-crafted metadata features for one raw email.

    The features are derived from the *raw* text (not the preprocessed one).
    Their order is significant: classify() appends this row to the TF-IDF
    columns, so it presumably has to match the layout the model was trained
    on (confirm against the training pipeline before reordering or "fixing"
    any individual feature).

    Returns a NumPy array of shape (1, 24).
    """
    text_lower = text.lower()
    words = text.split()
    lower_tokens = text_lower.split()

    sentences = re.split(r'[.!?]+', text)
    # Number of non-blank sentences, floored at 1 so it is a safe divisor.
    sc = max(len([s for s in sentences if s.strip()]), 1)
    alpha = [c for c in text if c.isalpha()]
    # Host part of every http(s) URL.
    urls = re.findall(r'https?://([^\s/]+)', text)

    months = [
        'january', 'february', 'march', 'april', 'may', 'june', 'july',
        'august', 'september', 'october', 'november', 'december',
    ]
    date_pats = [
        r'\b(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\b',
        r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}',
        r'\b\d{1,2}/\d{1,2}/\d{2,4}\b',
        r'\b\d{4}-\d{2}-\d{2}\b',
    ]
    cta_words = ['buy', 'order', 'click', 'act', 'hurry', 'rush', 'grab', 'claim']
    info_words = [
        'schedule', 'agenda', 'details', 'information', 'about', 'learn',
        'location', 'venue', 'date', 'time', 'address', 'contact',
    ]

    # Each phrase counts at most once, however often it occurs.
    spam_hits = sum(1 for p in spam_context_phrases if p in text_lower)
    ham_hits = sum(1 for p in ham_context_phrases if p in text_lower)

    # Whole-token occurrences of call-to-action vs. informational words.
    cta_hits = sum(lower_tokens.count(w) for w in cta_words)
    info_hits = sum(lower_tokens.count(w) for w in info_words)

    has_phone = re.search(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', text)
    has_email = re.search(
        r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', text
    )

    # NOTE(review): shortener / platform detection is a substring test on the
    # host, so e.g. 't.co' also matches 'microsoft.com'.  Left untouched on
    # purpose: changing it would alter a feature the model was trained with.
    shortener_urls = sum(
        1 for u in urls if any(s in u.lower() for s in url_shorteners)
    )
    platform_urls = sum(
        1 for u in urls if any(p in u.lower() for p in legitimate_platforms)
    )
    gov_edu_urls = sum(
        1 for u in urls if u.lower().endswith(('.gov', '.edu'))
    )

    features = [
        # 1. exclamation marks per sentence
        text.count('!') / sc,
        # 2. dollar signs
        text.count('$'),
        # 3. share of ALL-CAPS words (longer than one character)
        len([w for w in words if w.isupper() and len(w) > 1]) / max(len(words), 1),
        # 4-6. spam phrases, ham phrases, and their difference
        spam_hits,
        ham_hits,
        spam_hits - ham_hits,
        # 7. URLs (http(s):// or www.)
        len(re.findall(r'https?://\S+|www\.\S+', text)),
        # 8. HTML tags
        len(re.findall(r'<[^>]+>', text)),
        # 9-10. total length and average length per sentence
        len(text),
        len(text) / sc,
        # 11. share of upper-case letters among alphabetic characters
        sum(1 for c in alpha if c.isupper()) / max(len(alpha), 1),
        # 12. contains a weekday / "Month day" / numeric date
        1 if any(re.search(p, text, re.IGNORECASE) for p in date_pats) else 0,
        # 13. contains a clock time with AM/PM
        1 if re.search(r'\b\d{1,2}:\d{2}\s*(?:AM|PM|am|pm)\b', text) else 0,
        # 14. month-name mentions
        sum(len(re.findall(r'\b' + m + r'\b', text_lower)) for m in months),
        # 15. mentions unsubscribe / opt-out
        1 if re.search(r'unsubscribe|opt.out', text_lower) else 0,
        # 16. contains something shaped like a street address
        1 if re.search(r'\d+\s+\w+\s+(?:St|Street|Ave|Avenue|Blvd|Boulevard|Dr|Drive|Rd|Road|Ln|Lane|Way|Ct|Court)\b', text, re.IGNORECASE) else 0,
        # 17. text starts with a greeting
        1 if re.search(r'^(?:Dear|Hello|Hi|Good morning|Good afternoon)\s+\w', text.strip(), re.IGNORECASE) else 0,
        # 18. contains a phone number or an e-mail address
        1 if (has_phone or has_email) else 0,
        # 19. registration phrases
        sum(1 for p in registration_phrases if p in text_lower),
        # 20. call-to-action words relative to informational words
        cta_hits / max(info_hits, 1),
        # 21. share of URLs that look like shortener links
        shortener_urls / max(len(urls), 1),
        # 22. URLs on well-known platforms
        platform_urls,
        # 23. URLs whose host ends in .gov or .edu
        gov_edu_urls,
        # 24. question marks
        text.count('?'),
    ]
    return np.array([features])


def classify(text: str):
    """Classify one raw email with the loaded model.

    The TF-IDF vector of the preprocessed text is concatenated with the
    metadata row from compute_meta() and passed to ``model.predict_proba``.
    The second probability column is compared against ``threshold`` and is
    treated as the spam probability (assumes the model's classes are ordered
    ham, spam - confirm against the training code).

    Returns a tuple ``(label, spam_percent, ham_percent)`` where ``label`` is
    ``'SPAM'`` or ``'HAM'``.
    """
    clean = preprocess_text(text)
    X = hstack(
        [vectorizer.transform([clean]), csr_matrix(compute_meta(text))]
    ).toarray()
    proba = model.predict_proba(X)[0]
    label = 'SPAM' if proba[1] >= threshold else 'HAM'
    return label, proba[1] * 100, proba[0] * 100


# ---- Test Suite ----
tests = [
    # (name, is_spam, email_text)
    # === SPAM ===
    (
        'Nigerian Prince Scam', True,
        "Subject: URGENT - You Have Won $5,000,000!!!\n\n"
        "Dear Friend,\n"
        "CONGRATULATIONS!!! You have been selected as the winner of our international lottery program!!!\n"
        "To claim your $5,000,000 USD prize, click the link below IMMEDIATELY and provide your bank details.\n"
        "ACT NOW - This offer expires in 24 hours!!!\n"
        "Click here: http://totally-legit-prize.com/claim\n"
        "Send $500 processing fee to unlock your winnings.\n"
        "Best regards, Dr. Prince Mohammed",
    ),
    (
        'Viagra/Pharma Spam', True,
        "Subject: Best prices on V1AGRA and C1ALIS!!!\n\n"
        "$$$ SAVE BIG $$$\n"
        "Buy now and get 80% OFF!!!\n"
        "No prescription needed! Free shipping!\n"
        "Order at http://cheap-pharma-deals.com\n"
        "LIMITED TIME OFFER - ACT NOW!\n"
        "Subscribe to our mailing list for more deals!",
    ),
    (
        'Phishing - Account Suspended', True,
        "Subject: Your account has been suspended - Action Required IMMEDIATELY\n\n"
        "Dear Customer,\n"
        "We have detected unusual activity on your account. Your account will be permanently DELETED unless you verify your identity within 24 hours.\n"
        "Click here to verify: http://secure-bank-verify.com/login\n"
        "You must provide your Social Security Number and bank account details.\n"
        "ACT NOW or lose access forever!!!",
    ),
    (
        'Weight Loss Spam', True,
        "Subject: Lose 30 Pounds in 30 Days - GUARANTEED!!!\n\n"
        "Amazing new weight loss pill that doctors dont want you to know about!\n"
        "Buy now and get FREE shipping! No prescription needed!\n"
        "Make money fast by selling to your friends!\n"
        "Order at http://miracle-diet-pills.com\n"
        "Limited time offer - act now!!!",
    ),
    (
        'Casino Spam', True,
        "Subject: WIN BIG at our online casino!!!\n\n"
        "$$$ FREE $500 BONUS just for signing up! $$$\n"
        "No deposit required! Start winning TODAY!\n"
        "Click here: http://best-online-casino-deals.com\n"
        "Make money fast playing from home!\n"
        "Wire transfer your winnings directly to your bank account!",
    ),
    # === HAM ===
    (
        'KRT QRT Cycling Event', False,
        "Subject: KRT QRT 6th Annual Royal Weekend - Virginia Beach Edition\n\n"
        "GET YOUR TICKETS NOW!\n"
        "Join us for the 6th Annual Royal Weekend in Virginia Beach!\n"
        "Saturday, March 22, 2026 - Virginia Beach Convention Center\n"
        "SCHEDULE:\n"
        "- 10:00 AM: Registration & Check-In\n"
        "- 11:00 AM: Group Ride (30 miles)\n"
        "- 2:00 PM: Cookout & Awards\n"
        "- 5:00 PM: After Party\n"
        "Early Bird Tickets: $35 (before March 10)\n"
        "Regular Tickets: $45\n"
        "Register now at https://www.eventbrite.com/krt-royal-weekend-2026\n"
        "Follow us on Instagram @KRT_QRT\n"
        "Questions? Email krtqrt@gmail.com or call (757) 555-0123\n"
        "See you there!\n"
        "KRT QRT Cycling Club, Virginia Beach, VA",
    ),
    (
        'Team Meeting Invite', False,
        "Subject: Team sync Thursday 2pm\n\n"
        "Hi everyone,\n"
        "Just a reminder that we have our weekly team sync this Thursday at 2pm in Conference Room B.\n"
        "Agenda:\n"
        "- Sprint review\n"
        "- Q2 planning discussion\n"
        "- New hire onboarding update\n"
        "Please come prepared with your status updates.\n"
        "Thanks, Sarah",
    ),
    (
        'Tech Discussion', False,
        "Subject: Re: Python 3.12 upgrade\n\n"
        "Hey Mike,\n"
        "I tested the upgrade on our staging environment yesterday. Everything looks good except for one deprecation warning in the logging module.\n"
        "The new pattern matching syntax is really nice for our parser module. Want to pair on refactoring that section next week?\n"
        "Cheers, Dave",
    ),
    (
        'Family Email', False,
        "Subject: Thanksgiving dinner plans\n\n"
        "Hi everyone!\n"
        "Hope youre all doing well. I wanted to start planning for Thanksgiving dinner this year.\n"
        "Mom and Dad said they can host again. I was thinking we could do a potluck style.\n"
        "Can everyone reply with what theyd like to bring?\n"
        "Love, Jenny",
    ),
    (
        'University Newsletter', False,
        "Subject: ODU Weekly Digest - March 2026\n\n"
        "Old Dominion University - Campus Events & News\n"
        "Dear Students,\n"
        "Spring Break is almost here! Here is your weekly update:\n"
        "UPCOMING EVENTS:\n"
        "- Tuesday, March 10: Career Fair at Webb Center, 10:00 AM - 3:00 PM\n"
        "- Thursday, March 12: Guest Lecture on AI Ethics, 7:00 PM\n"
        "- Friday, March 13: Spring Concert tickets available - register now at https://www.odu.edu/events\n"
        "Contact the Office of Student Engagement at engage@odu.edu or (757) 683-3468\n"
        "123 Student Success Center, Norfolk, VA 23529\n"
        "To unsubscribe from this newsletter, click here.",
    ),
    (
        'Charity 5K Registration', False,
        "Subject: Register Now - Hampton Roads Heart Walk 5K\n\n"
        "Dear Community Members,\n"
        "Join us for the annual Hampton Roads Heart Walk 5K!\n"
        "Date: Saturday, April 5, 2026\n"
        "Time: 8:00 AM check-in, 9:00 AM start\n"
        "Location: Town Point Park, 113 Waterside Dr, Norfolk, VA 23510\n"
        "Registration: $25 adults, $10 students, free for children under 12\n"
        "Sign up at https://www.eventbrite.com/heart-walk-hampton-roads\n"
        "All proceeds benefit the American Heart Association.\n"
        "Questions? Contact Sarah Johnson at sarah@heartwalknorfolk.org or (757) 555-0456\n"
        "To opt out of future emails, click unsubscribe.",
    ),
    (
        'Book Club Announcement', False,
        "Subject: March Book Club Selection - The Great Gatsby Discussion\n\n"
        "Hello Book Lovers,\n"
        "Our March selection is The Great Gatsby by F. Scott Fitzgerald.\n"
        "Meeting Details:\n"
        "- Date: Wednesday, March 19, 2026\n"
        "- Time: 7:00 PM\n"
        "- Location: Norfolk Public Library, 235 E Plume St, Norfolk, VA 23510\n"
        "Discussion questions will be emailed next week. New members welcome!\n"
        "RSVP to bookclub@norfolklibrary.org\n"
        "See you there!\n"
        "Norfolk Community Book Club",
    ),
    (
        'Gym Class Schedule', False,
        "Subject: Spring 2026 Group Fitness Schedule - Norfolk YMCA\n\n"
        "Hi Members,\n"
        "Our new spring schedule starts Monday, March 3!\n"
        "HIGHLIGHTS:\n"
        "- NEW: Saturday 9:00 AM Spin Class with Coach Rivera\n"
        "- Yoga: Mon/Wed/Fri at 6:00 AM and 5:30 PM\n"
        "- CrossFit: Tue/Thu at 6:30 AM\n"
        "Early bird registration for summer camps opens March 15!\n"
        "Reserve your spot at https://www.ymca.org/norfolk/classes\n"
        "Norfolk YMCA, 312 W Bute St, Norfolk, VA 23510\n"
        "Phone: (757) 622-6328\n"
        "To manage your subscription preferences, visit your account settings.",
    ),
    (
        'Neighborhood HOA Newsletter', False,
        "Subject: Willowbrook HOA March Newsletter\n\n"
        "Dear Residents,\n"
        "Spring is here! Here are your community updates for March:\n"
        "COMMUNITY EVENTS:\n"
        "- Annual Spring Clean-Up: Saturday, March 8, 9:00 AM at the clubhouse\n"
        "- Easter Egg Hunt: Saturday, March 29, 10:00 AM at Willowbrook Park\n"
        "- Board Meeting: Tuesday, March 18, 7:00 PM\n"
        "REMINDERS:\n"
        "- Lawn maintenance season begins April 1\n"
        "- Pool opens May 24 - season passes available now\n"
        "Questions? Contact the HOA office at hoa@willowbrookva.org or (757) 555-0789\n"
        "Willowbrook HOA, 456 Willowbrook Dr, Virginia Beach, VA 23456\n"
        "You are receiving this because you are a Willowbrook resident.",
    ),
]

print('=' * 80)
print('SPAM CLASSIFIER ACCURACY TEST')
print('Model: Random Forest + 3000 TF-IDF + 24 metadata features')
print('Threshold: %.4f' % threshold)
print('=' * 80)
print()
print('%-35s %-8s %-8s %7s %s' % ('Test Case', 'Expected', 'Got', 'Spam%', 'Result'))
print('-' * 80)

correct = 0
# (is_spam, passed) per test, so the per-class summary below does not have to
# run the model a second time on every email.
outcomes = []
for name, is_spam, text in tests:
    label, spam_pct, ham_pct = classify(text)
    expected = 'SPAM' if is_spam else 'HAM'
    match = label == expected
    correct += match
    outcomes.append((is_spam, match))
    status = 'PASS' if match else '** FAIL **'
    print('%-35s %-8s %-8s %6.1f%% %s' % (name, expected, label, spam_pct, status))

print('-' * 80)
print('Overall: %d/%d (%.0f%%)' % (correct, len(tests), correct / len(tests) * 100))
print()

spam_tests = [(n, s, t) for n, s, t in tests if s]
ham_tests = [(n, s, t) for n, s, t in tests if not s]
# A spam test passes exactly when it was labelled SPAM, and a ham test exactly
# when it was labelled HAM, so the recorded outcomes give the same counts.
spam_ok = sum(1 for was_spam, passed in outcomes if was_spam and passed)
ham_ok = sum(1 for was_spam, passed in outcomes if not was_spam and passed)
print('Spam detection: %d/%d' % (spam_ok, len(spam_tests)))
print('Ham detection: %d/%d' % (ham_ok, len(ham_tests)))