Upload folder using huggingface_hub

960ec3d verified about 2 months ago

9.94 kB

	# Shared utilities for the spam classifier project
	# ENGT 375 Project - Spring 2026 - ODU
	# I put the shared functions here so I don't have to copy-paste them
	# between the training script and the Streamlit app

	import re
	import numpy as np
	from nltk.corpus import stopwords
	from nltk.stem import PorterStemmer

	# URL of the local Ollama chat endpoint
	OLLAMA_API = 'http://localhost:11434/api/chat'

	# Names of the LLM-derived features: four intent dimensions, then two tone dimensions
	LLM_FEATURE_NAMES = [
	    'intent_promotional',
	    'intent_transactional',
	    'intent_personal',
	    'intent_phishing',
	    'tone_urgency',
	    'tone_formality',
	]

	# Hand-picked phrase lists, collected by looking at common spam patterns.
	# A phrase from the first list counts as a spam signal, one from the second as a ham signal.
	spam_context_phrases = [
	    'act now',
	    'limited time',
	    'click to claim',
	    'you have won',
	    'wire transfer',
	    'bank account',
	    'million dollar',
	    'free gift',
	    'no prescription',
	    'buy now',
	    'make money fast',
	    'lose weight',
	    'casino',
	    'free credit',
	]

	ham_context_phrases = [
	    # newsletter / opt-out wording
	    'click to unsubscribe', 'unsubscribe from', 'to opt out',
	    'this newsletter', 'you are receiving this',
	    # official / institutional wording
	    'official notice', 'department of', 'office of',
	    'subscribe to updates', 'manage your subscription',
	    'privacy policy', 'government website',
	    # registration wording
	    'register now', 'sign up', 'reserve your spot', 'rsvp',
	    # event wording
	    'event details', 'schedule', 'agenda', 'venue',
	    'annual', 'edition', 'season', 'community',
	]

	# Registration / event wording (ham signal for community emails)
	registration_phrases = [
	    'register now',
	    'sign up',
	    'reserve your spot',
	    'rsvp',
	    'registration open',
	    'tickets available',
	    'limited capacity',
	    'early bird',
	    'reserve your seat',
	    'join us',
	]

	# Domains of URL-shortening services (spam signal)
	url_shorteners = [
	    'bit.ly',
	    'tinyurl.com',
	    'goo.gl',
	    't.co',
	    'ow.ly',
	    'is.gd',
	    'buff.ly',
	    'adf.ly',
	    'shorte.st',
	]

	# Domains of well-known legitimate platforms (ham signal)
	legitimate_platforms = [
	    'eventbrite.com', 'meetup.com',
	    'mailchimp.com', 'constantcontact.com',
	    'surveymonkey.com', 'google.com',
	    'zoom.us', 'microsoft.com',
	    'linkedin.com', 'facebook.com',
	    'github.com', 'youtube.com',
	    'steampowered.com', 'store.steampowered.com',
	    'paypal.com', 'stripe.com',
	    'shopify.com', 'etsy.com',
	    'bestbuy.com', 'target.com',
	    'amazon.com', 'netflix.com',
	    'spotify.com',
	]

	# Shared NLTK helpers, created once at import time and used by preprocess_text.
	# Porter stemmer instance used to reduce each kept token to its stem.
	stemmer = PorterStemmer()
	# English stop-word list stored as a set so membership tests are cheap.
	# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be downloaded
	# before this module is imported - confirm in the setup instructions.
	stop_words = set(stopwords.words('english'))


	# Clean and stem the input text (Prof. Kuzlu showed us stemming in class)
	# Patterns that preprocess_text replaces with a single space, applied in this order.
	_CLEANUP_PATTERNS = tuple(re.compile(p) for p in (
	    r'<[^>]+>',                # HTML tags
	    r'https?://\S+|www\.\S+',  # URLs (http(s)://... or www....)
	    r'\S+@\S+',                # e-mail addresses
	    r'[^a-zA-Z\s]',            # anything that is not an ASCII letter or whitespace
	))


	def preprocess_text(text):
	    """Clean, tokenise and stem one piece of text.

	    Steps, in order: replace HTML tags, URLs, e-mail addresses and every
	    character that is not an ASCII letter or whitespace with a space;
	    lower-case; split on whitespace; drop stop words and tokens of two
	    characters or fewer; stem what is left with the module-level stemmer.

	    Args:
	        text: The raw text as a str.

	    Returns:
	        The stemmed tokens joined by single spaces ('' when nothing survives).
	    """
	    # The URL pattern uses a real alternation ('|'); an escaped '\|' would only
	    # match a literal pipe character and leave URLs in the text.
	    for pattern in _CLEANUP_PATTERNS:
	        text = pattern.sub(' ', text)

	    tokens = text.lower().split()
	    return ' '.join(
	        stemmer.stem(token)
	        for token in tokens
	        if len(token) > 2 and token not in stop_words
	    )


	# Compute 24 metadata features from a list of email texts
	# I designed these features based on what I noticed about spam vs ham emails -
	# things like exclamation marks, dollar signs, and ALL CAPS words
	# Number of columns produced by compute_metadata_features.
	_N_METADATA_FEATURES = 24

	_MONTH_NAMES = ('january', 'february', 'march', 'april', 'may', 'june',
	                'july', 'august', 'september', 'october', 'november', 'december')
	_MONTH_ALTERNATION = '|'.join(_MONTH_NAMES)

	# Patterns used by compute_metadata_features, compiled once at import time.
	_META_SENTENCE_END_RE = re.compile(r'[.!?]+')
	_META_URL_RE = re.compile(r'https?://\S+|www\.\S+')
	_META_URL_HOST_RE = re.compile(r'https?://([^\s/]+)')
	_META_HTML_TAG_RE = re.compile(r'<[^>]+>')
	_META_DATE_RES = (
	    # day of week
	    re.compile(r'\b(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\b', re.IGNORECASE),
	    # month name followed by a day number
	    re.compile(r'\b(?:' + _MONTH_ALTERNATION + r')\s+\d{1,2}', re.IGNORECASE),
	    # numeric dates such as 3/14/2026 and 2026-03-14
	    re.compile(r'\b\d{1,2}/\d{1,2}/\d{2,4}\b'),
	    re.compile(r'\b\d{4}-\d{2}-\d{2}\b'),
	)
	# NOTE(review): because of the trailing \b, 'a.m.' / 'p.m.' only match when a
	# word character follows the final dot. Kept as written; confirm whether intended.
	_META_TIME_RE = re.compile(r'\b\d{1,2}:\d{2}\s*(?:AM|PM|am|pm|a\.m\.|p\.m\.)\b')
	_META_MONTH_RE = re.compile(r'\b(?:' + _MONTH_ALTERNATION + r')\b')
	_META_UNSUBSCRIBE_RE = re.compile(r'unsubscribe|opt.out')
	_META_ADDRESS_RE = re.compile(
	    r'\d+\s+\w+\s+(?:St|Street|Ave|Avenue|Blvd|Boulevard|Dr|Drive|Rd|Road|Ln|Lane|Way|Ct|Court)\b',
	    re.IGNORECASE)
	_META_GREETING_RE = re.compile(
	    r'^(?:Dear|Hello|Hi|Good morning|Good afternoon)\s+\w', re.IGNORECASE)
	# Optional parentheses around the area code, e.g. (757) 555-1234 or 757-555-1234.
	_META_PHONE_RE = re.compile(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}')
	_META_EMAIL_RE = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')

	# Call-to-action vs informational vocabulary for cta_to_info_ratio.
	_META_CTA_WORDS = frozenset(['buy', 'order', 'click', 'act', 'hurry', 'rush', 'grab', 'claim'])
	_META_INFO_WORDS = frozenset(['schedule', 'agenda', 'details', 'information', 'about', 'learn',
	                              'location', 'venue', 'date', 'time', 'address', 'contact'])


	def _meta_url_hosts(text):
	    """Return the lower-cased host of every http(s) URL found in text."""
	    hosts = []
	    for authority in _META_URL_HOST_RE.findall(text):
	        host = re.split(r'[?#]', authority.lower(), maxsplit=1)[0]  # drop query / fragment
	        host = host.rsplit('@', 1)[-1]                              # drop userinfo
	        host = host.split(':', 1)[0]                                # drop port
	        hosts.append(host.rstrip('.,;!)>]\'"'))                     # drop trailing punctuation
	    return hosts


	def _meta_host_in_domains(host, domains):
	    """Return True when host equals one of domains or is a subdomain of one."""
	    return any(host == d or host.endswith('.' + d) for d in domains)


	def compute_metadata_features(texts):
	    """Compute 24 hand-designed metadata features for each email text.

	    The features capture surface cues noticed in spam vs ham emails, such as
	    exclamation marks, dollar signs and ALL-CAPS words.

	    Args:
	        texts: Iterable of email texts (str).

	    Returns:
	        A float numpy array of shape (number of texts, 24); an empty input
	        gives shape (0, 24). Columns, in order:
	         1. exclamation_density          '!' count per sentence
	         2. dollar_count                 '$' count
	         3. caps_word_ratio              share of ALL-CAPS words (length > 1)
	         4. spam_phrase_count            spam_context_phrases present
	         5. ham_phrase_count             ham_context_phrases present
	         6. net_spam_context             column 4 minus column 5
	         7. url_count                    http(s):// or www. URLs
	         8. html_tag_count               <...> tags
	         9. email_length                 characters
	        10. avg_sentence_length          characters per sentence
	        11. cap_ratio                    upper-case share of alphabetic chars
	        12. has_specific_date            0/1
	        13. has_specific_time            0/1
	        14. date_reference_count         month-name mentions
	        15. has_unsubscribe              0/1
	        16. has_physical_address         0/1
	        17. has_proper_greeting          0/1
	        18. has_contact_info             0/1 (phone number or e-mail address)
	        19. registration_language_score  registration_phrases present
	        20. cta_to_info_ratio            call-to-action words / informational words
	        21. shortener_url_ratio          share of http(s) URLs on url_shorteners hosts
	        22. legitimate_platform_count    http(s) URLs on legitimate_platforms hosts
	        23. gov_edu_url_count            http(s) URLs whose host ends in .gov or .edu
	        24. question_mark_count          '?' count
	    """
	    features = []
	    for text in texts:
	        text_lower = text.lower()
	        words = text.split()
	        words_lower = text_lower.split()  # split once, reused for features 20

	        # Sentences are the non-blank pieces between runs of . ! ?; at least 1
	        # so the per-sentence ratios never divide by zero.
	        sentence_count = max(
	            sum(1 for s in _META_SENTENCE_END_RE.split(text) if s.strip()), 1)

	        # 1-3
	        exclamation_density = text.count('!') / sentence_count
	        dollar_count = text.count('$')
	        caps_word_total = sum(1 for w in words if w.isupper() and len(w) > 1)
	        caps_word_ratio = caps_word_total / max(len(words), 1)

	        # 4-6
	        spam_phrase_count = sum(1 for p in spam_context_phrases if p in text_lower)
	        ham_phrase_count = sum(1 for p in ham_context_phrases if p in text_lower)
	        net_spam_context = spam_phrase_count - ham_phrase_count

	        # 7-10
	        url_count = len(_META_URL_RE.findall(text))
	        html_tag_count = len(_META_HTML_TAG_RE.findall(text))
	        email_length = len(text)
	        avg_sentence_length = len(text) / sentence_count

	        # 11. character-level capitalisation
	        alpha_total = sum(1 for c in text if c.isalpha())
	        upper_total = sum(1 for c in text if c.isalpha() and c.isupper())
	        cap_ratio = upper_total / max(alpha_total, 1)

	        # 12-14
	        has_specific_date = 1 if any(p.search(text) for p in _META_DATE_RES) else 0
	        has_specific_time = 1 if _META_TIME_RE.search(text) else 0
	        date_reference_count = len(_META_MONTH_RE.findall(text_lower))

	        # 15-18
	        has_unsubscribe = 1 if _META_UNSUBSCRIBE_RE.search(text_lower) else 0
	        has_physical_address = 1 if _META_ADDRESS_RE.search(text) else 0
	        has_proper_greeting = 1 if _META_GREETING_RE.search(text.strip()) else 0
	        has_contact_info = 1 if (_META_PHONE_RE.search(text)
	                                 or _META_EMAIL_RE.search(text)) else 0

	        # 19
	        registration_language_score = sum(
	            1 for phrase in registration_phrases if phrase in text_lower)

	        # 20. whole-token matches only
	        cta_count = sum(1 for w in words_lower if w in _META_CTA_WORDS)
	        info_count = sum(1 for w in words_lower if w in _META_INFO_WORDS)
	        cta_to_info_ratio = cta_count / max(info_count, 1)

	        # 21-23. Hosts are compared as whole domains (exact or subdomain), not as
	        # substrings: 't.co' must not match 'microsoft.com', and 'google.com'
	        # must not match 'google.com.evil.example'.
	        hosts = _meta_url_hosts(text)
	        shortener_count = sum(
	            1 for h in hosts if _meta_host_in_domains(h, url_shorteners))
	        shortener_url_ratio = shortener_count / max(len(hosts), 1)
	        legitimate_platform_count = sum(
	            1 for h in hosts if _meta_host_in_domains(h, legitimate_platforms))
	        gov_edu_url_count = sum(1 for h in hosts if h.endswith(('.gov', '.edu')))

	        # 24. questions suggest conversation (ham signal)
	        question_mark_count = text.count('?')

	        features.append([
	            exclamation_density, dollar_count, caps_word_ratio,
	            spam_phrase_count, ham_phrase_count, net_spam_context,
	            url_count, html_tag_count, email_length,
	            avg_sentence_length, cap_ratio,
	            has_specific_date, has_specific_time, date_reference_count,
	            has_unsubscribe, has_physical_address, has_proper_greeting,
	            has_contact_info, registration_language_score,
	            cta_to_info_ratio, shortener_url_ratio,
	            legitimate_platform_count, gov_edu_url_count,
	            question_mark_count,
	        ])

	    # reshape keeps the column dimension even when texts is empty.
	    return np.array(features, dtype=float).reshape(-1, _N_METADATA_FEATURES)