# Generate synthetic newsletter emails using Ollama
# ENGT 375 Project - Spring 2026 - ODU
# I created this script because my model was classifying newsletters as spam -
# it needed more examples of legitimate promotional-style emails in the training data

import re
import json
import requests
from pathlib import Path

from utils_student import OLLAMA_API

OUTPUT_DIR = Path(__file__).parent / 'data' / 'raw' / 'newsletters'
NUM_EMAILS = 600

# Matches one complete <think>...</think> block, including embedded newlines.
# NOTE(review): the pattern used before was r'.*?', which on Python 3.7+
# matches (and therefore deletes) every character of the reply. Stripping
# reasoning blocks is the presumed intent, given the 'think': False option
# and the /no_think suffix in the prompt - confirm against the models used.
_THINK_BLOCK_RE = re.compile(r'<think>.*?</think>', re.DOTALL)

NEWSLETTER_TYPES = [
    # Government / institutional
    "VA (Veterans Affairs) benefits update newsletter",
    "Social Security Administration monthly update",
    "IRS tax season reminder newsletter",
    "State university alumni association newsletter",
    "Public library community events newsletter",
    "City council weekly digest",
    "County health department wellness newsletter",
    "State DMV services update",
    "Federal student aid (FAFSA) reminder email",
    "National Park Service seasonal newsletter",
    # Community / sports / events
    "Eventbrite event registration for a local 5K charity run",
    "Meetup.com weekly digest for a Python developer group",
    "Local rec league sports schedule and registration reminder",
    "Community theater ticket sale announcement",
    "Youth soccer club seasonal registration email",
    "Local gym membership promotion with class schedule",
    "Cycling club annual group ride and event announcement",
    "Neighborhood association monthly events digest",
    "University student organization weekly newsletter",
    "Non-profit volunteer opportunity announcement",
    "Chamber of commerce networking event invitation",
    "Adult softball league spring registration reminder",
    "Book club monthly selection and meeting announcement",
    "Charity fundraising gala ticket sale email",
    "Local farmers market vendor and schedule newsletter",
    # Modern commercial / transactional (common false positives)
    "Steam / gaming platform wishlist sale and purchase notification",
    "Amazon / eBay / Etsy order confirmation and shipping update",
    "GitHub Actions CI notification and pull request comment",
    "Netflix / Spotify subscription renewal receipt",
    "Airbnb / hotel booking confirmation and check-in reminder",
    "LinkedIn job alert and connection request notification",
    "Bank account activity notification (Chase / Bank of America style)",
    "UPS / FedEx / USPS shipping notification with tracking number",
    "Zoom / Microsoft Teams meeting invitation and reminder",
    "University Canvas / Blackboard grade notification and assignment reminder",
]

# Prompt sent for every email; {newsletter_type} is filled from NEWSLETTER_TYPES.
PROMPT_TEMPLATE = """Write a realistic {newsletter_type} email.
Requirements:
- Include a Subject: line at the top
- Use professional but accessible language
- Include 2-4 links (use realistic placeholder URLs like https://www.va.gov/benefits or https://www.eventbrite.com/e/example)
- Mention words like "free", "click here", "subscribe", "limited time", "register now", "sign up" naturally where appropriate
- Include a mix of uppercase headers and normal text
- Include a specific date and time (e.g., "Saturday, May 14, 2026 at 9:00 AM")
- Include a specific location or venue (e.g., "Virginia Beach Convention Center, 1000 19th St")
- Include an organization name and a contact person name
- Include a physical mailing address in the footer (e.g., "123 Main St, Suite 200, Norfolk, VA 23510")
- 150-400 words in the body
- End with an unsubscribe notice
Write ONLY the email (Subject line + body). No commentary.
/no_think"""


# Remove model "thinking" output from a reply
def _strip_think_blocks(text):
    """Return text with every <think>...</think> block removed and trimmed."""
    return _THINK_BLOCK_RE.sub('', text).strip()


# Generate one synthetic newsletter email via Ollama
def generate_email(newsletter_type, model='gemma3:1b'):
    """Ask the Ollama endpoint for one email of the given newsletter type.

    Args:
        newsletter_type: Description substituted into PROMPT_TEMPLATE.
        model: Name of the Ollama model to query.

    Returns:
        The generated text with <think> blocks removed and surrounding
        whitespace stripped, or None when the request fails, the response
        is not valid JSON, or the server answers with a non-200 status.
    """
    prompt = PROMPT_TEMPLATE.format(newsletter_type=newsletter_type)
    payload = {
        'model': model,
        'messages': [{'role': 'user', 'content': prompt}],
        'stream': False,
        'think': False,
        'options': {'temperature': 0.8, 'num_predict': 600},
    }
    try:
        resp = requests.post(OLLAMA_API, json=payload, timeout=120)
        if resp.status_code != 200:
            # Report the failure instead of silently returning None.
            print(' Error generating email: HTTP %s' % resp.status_code)
            return None
        # "or ''" keeps a null content field from reaching the regex.
        content = resp.json().get('message', {}).get('content') or ''
        return _strip_think_blocks(content)
    except (requests.RequestException, ValueError, KeyError) as e:
        print(' Error generating email: %s' % e)
    return None


# Build the output path for the newsletter with the given index
def _newsletter_path(index):
    """Return OUTPUT_DIR / 'newsletter_NNN.txt' for the given index."""
    return OUTPUT_DIR / ('newsletter_%03d.txt' % index)


# Pick an installed model, preferring gemma3 over qwen3.5
def _select_model():
    """Return the first installed gemma3 or qwen3.5 model name, else None.

    Prints a hint and returns None when the local Ollama server cannot be
    queried or when neither model family is installed.
    """
    try:
        resp = requests.get('http://localhost:11434/api/tags', timeout=5)
        models = [m['name'] for m in resp.json().get('models', [])]
    except (requests.RequestException, ValueError, KeyError) as e:
        print('Ollama not running (%s). Start with: ollama serve' % e)
        return None

    # Prefer gemma3:1b, fall back to qwen3.5
    gemma = [m for m in models if 'gemma3' in m]
    qwen = [m for m in models if 'qwen3.5' in m]
    preferred = gemma or qwen
    if not preferred:
        print('No gemma3 or qwen3.5 model found. Run: ollama pull gemma3:1b')
        return None

    model = preferred[0]
    print('Using model: %s' % model)
    return model


# Main function to generate all newsletters
def main():
    """Generate newsletters until OUTPUT_DIR holds NUM_EMAILS .txt files.

    Counts the .txt files already present and attempts one generation per
    missing file. Emails that fail or are 50 characters or shorter are
    skipped, so a single run may end below the target; rerun to top up.
    """
    print('Starting newsletter generation...')
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Check Ollama availability
    model = _select_model()
    if model is None:
        return

    existing = list(OUTPUT_DIR.glob('*.txt'))
    start_idx = len(existing)
    needed = NUM_EMAILS - start_idx
    if needed <= 0:
        print('Already have %d newsletters. Done.' % len(existing))
        return

    print('Generating %d newsletters (have %d, target %d)...'
          % (needed, start_idx, NUM_EMAILS))

    # The file index advances only on a successful write, so skipped emails
    # leave no numbering gaps. A gap would make a later run's start index
    # point at an existing file and overwrite it.
    next_idx = start_idx
    for i in range(needed):
        ntype = NEWSLETTER_TYPES[i % len(NEWSLETTER_TYPES)]
        print(' [%d/%d] %s...' % (start_idx + i + 1, NUM_EMAILS, ntype))
        email = generate_email(ntype, model)
        if not email or len(email) <= 50:
            print(' Skipped (too short or failed)')
            continue

        # Never overwrite a file left behind by an earlier run.
        while _newsletter_path(next_idx).exists():
            next_idx += 1
        with open(_newsletter_path(next_idx), 'w', encoding='utf-8') as f:
            f.write(email)
        next_idx += 1

    total = len(list(OUTPUT_DIR.glob('*.txt')))
    print('\nDone. %d newsletter files in %s' % (total, OUTPUT_DIR))
    print('Done generating!')


if __name__ == '__main__':
    main()