|
|
|
|
|
|
|
|
|
|
| import re
|
| import numpy as np
|
| from nltk.corpus import stopwords
|
| from nltk.stem import PorterStemmer
|
|
|
|
|
# Chat endpoint (`/api/chat`) on localhost port 11434 -- presumably a locally
# running Ollama server; the code that calls it is not in this section.
OLLAMA_API = 'http://localhost:11434/api/chat'

# Names of the six LLM-related features: four `intent_*` entries followed by
# two `tone_*` entries.  The order is kept exactly as declared.
LLM_FEATURE_NAMES = [
    'intent_promotional',
    'intent_transactional',
    'intent_personal',
    'intent_phishing',
    'tone_urgency',
    'tone_formality',
]
|
|
|
|
|
|
|
# Lower-case phrases whose presence in an email counts towards the spam
# phrase score computed by compute_metadata_features (substring match).
spam_context_phrases = [
    'act now',
    'limited time',
    'click to claim',
    'you have won',
    'wire transfer',
    'bank account',
    'million dollar',
    'free gift',
    'no prescription',
    'buy now',
    'make money fast',
    'lose weight',
    'casino',
    'free credit',
]
|
|
|
# Lower-case phrases whose presence in an email counts towards the ham
# (legitimate mail) phrase score computed by compute_metadata_features
# (substring match).  Grouping comments are descriptive only; the element
# order is unchanged.
ham_context_phrases = [
    # subscription management
    'click to unsubscribe',
    'unsubscribe from',
    'to opt out',
    'this newsletter',
    'you are receiving this',
    # official / institutional wording
    'official notice',
    'department of',
    'office of',
    'subscribe to updates',
    'manage your subscription',
    'privacy policy',
    'government website',
    # event registration
    'register now',
    'sign up',
    'reserve your spot',
    'rsvp',
    # event logistics
    'event details',
    'schedule',
    'agenda',
    'venue',
    # recurring / community wording
    'annual',
    'edition',
    'season',
    'community',
]
|
|
|
|
|
# Lower-case phrases counted (substring match) to build the
# registration_language_score feature in compute_metadata_features.
registration_phrases = [
    'register now',
    'sign up',
    'reserve your spot',
    'rsvp',
    'registration open',
    'tickets available',
    'limited capacity',
    'early bird',
    'reserve your seat',
    'join us',
]
|
|
|
|
|
# Lower-case domains of URL-shortening services; compute_metadata_features
# checks link hosts against this list for the shortener_url_ratio feature.
url_shorteners = [
    'bit.ly',
    'tinyurl.com',
    'goo.gl',
    't.co',
    'ow.ly',
    'is.gd',
    'buff.ly',
    'adf.ly',
    'shorte.st',
]
|
|
|
|
|
# Lower-case domains of well-known platforms; compute_metadata_features checks
# link hosts against this list for the legitimate_platform_count feature.
# Grouping comments are descriptive only; the element order is unchanged.
legitimate_platforms = [
    # events, mailing and surveys
    'eventbrite.com',
    'meetup.com',
    'mailchimp.com',
    'constantcontact.com',
    'surveymonkey.com',
    # general tech / social
    'google.com',
    'zoom.us',
    'microsoft.com',
    'linkedin.com',
    'facebook.com',
    'github.com',
    'youtube.com',
    # stores, payments and streaming
    'steampowered.com',
    'store.steampowered.com',
    'paypal.com',
    'stripe.com',
    'shopify.com',
    'etsy.com',
    'bestbuy.com',
    'target.com',
    'amazon.com',
    'netflix.com',
    'spotify.com',
]
|
|
|
# Module-level NLP helpers created once at import time and reused by every
# preprocess_text call: a Porter stemmer and the English stop-word set.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Reduce raw email text to a space-joined string of stemmed tokens.

    The text goes through these steps in order:

    1. HTML tags, URLs (``http(s)://...`` or ``www....``) and tokens that
       contain ``@`` are each replaced by a single space.
    2. Every character that is not an ASCII letter or whitespace is replaced
       by a space.
    3. The result is lower-cased and split on whitespace.
    4. Stop words and tokens of two characters or fewer are dropped; the
       remaining tokens are stemmed.

    Args:
        text: The raw email body as a string.

    Returns:
        The surviving stems joined by single spaces (an empty string when no
        token survives).
    """
    # Order matters: tags and URLs must go before the letters-only filter,
    # otherwise their fragments would survive as ordinary words.
    noise_patterns = (
        r'<[^>]+>',
        r'https?://\S+|www\.\S+',
        r'\S+@\S+',
        r'[^a-zA-Z\s]',
    )
    cleaned = text
    for pattern in noise_patterns:
        cleaned = re.sub(pattern, ' ', cleaned)

    stems = [
        stemmer.stem(token)
        for token in cleaned.lower().split()
        if len(token) > 2 and token not in stop_words
    ]
    return ' '.join(stems)
|
|
|
|
|
|
|
|
|
|
|
# Number of columns produced per email by compute_metadata_features.
_METADATA_FEATURE_COUNT = 24

# Patterns are compiled once here instead of on every email.
_SENTENCE_END_RE = re.compile(r'[.!?]+')
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_URL_HOST_RE = re.compile(r'https?://([^\s/]+)')
_HTML_TAG_RE = re.compile(r'<[^>]+>')
_DATE_RES = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'\b(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\b',
        r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2}',
        r'\b\d{1,2}/\d{1,2}/\d{2,4}\b',
        r'\b\d{4}-\d{2}-\d{2}\b',
    )
)
# The word boundary is only required after the dot-less "am"/"pm" form: a
# trailing \b after "p.m." would demand a word character right after the final
# dot and therefore reject "3:00 p.m." at the end of a sentence.
_TIME_RE = re.compile(r'\b\d{1,2}:\d{2}\s*(?:[ap]\.m\.|[ap]m\b)', re.IGNORECASE)
# Applied to lower-cased text.  Note that "may" also matches the modal verb.
_MONTH_NAME_RE = re.compile(
    r'\b(?:january|february|march|april|may|june|july|august|september'
    r'|october|november|december)\b'
)
# Applied to lower-cased text; the "." lets "opt-out" and "opt out" both match.
_UNSUBSCRIBE_RE = re.compile(r'unsubscribe|opt.out')
_ADDRESS_RE = re.compile(
    r'\d+\s+\w+\s+(?:St|Street|Ave|Avenue|Blvd|Boulevard|Dr|Drive|Rd|Road|Ln|Lane|Way|Ct|Court)\b',
    re.IGNORECASE,
)
_GREETING_RE = re.compile(
    r'^(?:Dear|Hello|Hi|Good morning|Good afternoon)\s+\w', re.IGNORECASE
)
_PHONE_RE = re.compile(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}')
_EMAIL_RE = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
# First character that cannot belong to a host name (port colon, quote,
# bracket, comma, ...).
_HOST_END_RE = re.compile(r'[^\w.-]')

# Punctuation stripped from both ends of a whitespace token before it is
# compared with the call-to-action / informational vocabularies.
_EDGE_PUNCTUATION = '.,;:!?"\'()[]{}<>*-_'
_CTA_WORDS = frozenset(
    ['buy', 'order', 'click', 'act', 'hurry', 'rush', 'grab', 'claim']
)
_INFO_WORDS = frozenset(
    ['schedule', 'agenda', 'details', 'information', 'about', 'learn',
     'location', 'venue', 'date', 'time', 'address', 'contact']
)


def _count_phrases_present(text_lower, phrases):
    """Return how many of *phrases* occur as substrings of *text_lower*."""
    return sum(1 for phrase in phrases if phrase in text_lower)


def _normalize_host(raw_host):
    """Return the bare lower-case host name from a URL's authority part.

    Removes a query/fragment glued to the host, ``user:pass@`` userinfo, a
    port, and any trailing punctuation picked up from the surrounding text.
    """
    host = raw_host.lower()
    for separator in '?#':
        host = host.split(separator, 1)[0]
    # Whatever precedes the last "@" is userinfo, not the host
    # (e.g. "paypal.com@evil.example" really points at evil.example).
    host = host.rsplit('@', 1)[-1]
    host = _HOST_END_RE.split(host, maxsplit=1)[0]
    return host.strip('.')


def _host_in_domains(host, domains):
    """Return True if *host* equals one of *domains* or is a sub-domain of it."""
    return any(host == domain or host.endswith('.' + domain) for domain in domains)


def compute_metadata_features(texts):
    """Compute hand-crafted metadata features for a collection of emails.

    Args:
        texts: Iterable of raw email strings.

    Returns:
        A float ``numpy`` array of shape ``(len(texts), 24)``.  Columns, in
        order:

        0.  exclamation_density -- ``!`` count divided by sentence count
        1.  dollar_count -- number of ``$`` characters
        2.  caps_word_ratio -- share of whitespace tokens (length > 1) that
            are fully upper-case
        3.  spam_phrase_count -- spam_context_phrases found in the text
        4.  ham_phrase_count -- ham_context_phrases found in the text
        5.  net_spam_context -- column 3 minus column 4
        6.  url_count -- ``http(s)://`` or ``www.`` links
        7.  html_tag_count -- ``<...>`` tags
        8.  email_length -- number of characters
        9.  avg_sentence_length -- characters per sentence
        10. cap_ratio -- share of alphabetic characters that are upper-case
        11. has_specific_date -- weekday name, "Month day", d/m/y or ISO date
        12. has_specific_time -- ``h:mm`` followed by am/pm (any case,
            with or without dots)
        13. date_reference_count -- occurrences of month names
        14. has_unsubscribe -- "unsubscribe" or "opt?out" present
        15. has_physical_address -- "<number> <word> <street suffix>"
        16. has_proper_greeting -- text starts with Dear/Hello/Hi/Good ...
        17. has_contact_info -- phone number or email address present
        18. registration_language_score -- registration_phrases found
        19. cta_to_info_ratio -- call-to-action words per informational word
        20. shortener_url_ratio -- share of ``http(s)`` links whose host is a
            known URL shortener (or a sub-domain of one)
        21. legitimate_platform_count -- ``http(s)`` links whose host is a
            known platform (or a sub-domain of one)
        22. gov_edu_url_count -- ``http(s)`` links whose host ends in
            ``.gov`` or ``.edu``
        23. question_mark_count -- number of ``?`` characters

        An empty *texts* yields an array of shape ``(0, 24)``.
    """
    rows = []
    for text in texts:
        text_lower = text.lower()
        words = text.split()

        # --- punctuation and casing -------------------------------------
        sentences = [s for s in _SENTENCE_END_RE.split(text) if s.strip()]
        sentence_count = max(len(sentences), 1)  # avoid division by zero
        exclamation_density = text.count('!') / sentence_count
        dollar_count = text.count('$')
        question_mark_count = text.count('?')

        caps_word_count = sum(1 for w in words if len(w) > 1 and w.isupper())
        caps_word_ratio = caps_word_count / max(len(words), 1)

        alpha_count = sum(1 for c in text if c.isalpha())
        upper_count = sum(1 for c in text if c.isalpha() and c.isupper())
        cap_ratio = upper_count / max(alpha_count, 1)

        # --- phrase context ---------------------------------------------
        spam_phrase_count = _count_phrases_present(text_lower, spam_context_phrases)
        ham_phrase_count = _count_phrases_present(text_lower, ham_context_phrases)
        net_spam_context = spam_phrase_count - ham_phrase_count
        registration_language_score = _count_phrases_present(
            text_lower, registration_phrases
        )

        # --- size and markup --------------------------------------------
        url_count = len(_URL_RE.findall(text))
        html_tag_count = len(_HTML_TAG_RE.findall(text))
        email_length = len(text)
        avg_sentence_length = email_length / sentence_count

        # --- dates and times --------------------------------------------
        has_specific_date = int(any(p.search(text) for p in _DATE_RES))
        has_specific_time = 1 if _TIME_RE.search(text) else 0
        date_reference_count = len(_MONTH_NAME_RE.findall(text_lower))

        # --- legitimacy markers -----------------------------------------
        has_unsubscribe = 1 if _UNSUBSCRIBE_RE.search(text_lower) else 0
        has_physical_address = 1 if _ADDRESS_RE.search(text) else 0
        has_proper_greeting = 1 if _GREETING_RE.search(text.strip()) else 0
        has_contact_info = (
            1 if (_PHONE_RE.search(text) or _EMAIL_RE.search(text)) else 0
        )

        # --- call-to-action vs informational vocabulary -----------------
        # Split once and strip edge punctuation so that "Hurry!" or "Click,"
        # are recognised; URLs stay single tokens and are not picked apart.
        tokens = [w.strip(_EDGE_PUNCTUATION) for w in text_lower.split()]
        cta_count = sum(1 for t in tokens if t in _CTA_WORDS)
        info_count = sum(1 for t in tokens if t in _INFO_WORDS)
        cta_to_info_ratio = cta_count / max(info_count, 1)

        # --- link hosts -------------------------------------------------
        # Hosts are normalised and compared as whole (sub-)domains: a plain
        # substring test would treat "microsoft.com" as the shortener "t.co"
        # and "google.com.evil.example" as a legitimate platform.
        hosts = [_normalize_host(h) for h in _URL_HOST_RE.findall(text)]
        shortener_count = sum(
            1 for h in hosts if _host_in_domains(h, url_shorteners)
        )
        shortener_url_ratio = shortener_count / max(len(hosts), 1)
        legitimate_platform_count = sum(
            1 for h in hosts if _host_in_domains(h, legitimate_platforms)
        )
        gov_edu_url_count = sum(1 for h in hosts if h.endswith(('.gov', '.edu')))

        rows.append([
            exclamation_density, dollar_count, caps_word_ratio,
            spam_phrase_count, ham_phrase_count, net_spam_context,
            url_count, html_tag_count, email_length,
            avg_sentence_length, cap_ratio,
            has_specific_date, has_specific_time, date_reference_count,
            has_unsubscribe, has_physical_address, has_proper_greeting,
            has_contact_info, registration_language_score,
            cta_to_info_ratio, shortener_url_ratio,
            legitimate_platform_count, gov_edu_url_count,
            question_mark_count,
        ])

    # The reshape keeps the result two-dimensional even when *texts* is empty.
    return np.array(rows, dtype=float).reshape(-1, _METADATA_FEATURE_COUNT)
|
|
|