| """
|
| Generate synthetic newsletter ham examples using Ollama.
|
| Produces 50 realistic government/institutional newsletter emails.
|
| Run: python generate_newsletters.py
|
| """
|
| import json
|
| import requests
|
| from pathlib import Path
|
|
|
# Chat endpoint of the local Ollama server; generate_email() POSTs prompts here.
OLLAMA_API = 'http://localhost:11434/api/chat'

# Directory (relative to this script) where generated newsletters are written
# as .txt files.
OUTPUT_DIR = Path(__file__).parent / 'data' / 'raw' / 'newsletters'

# Target number of .txt files in OUTPUT_DIR; main() only generates the shortfall.
NUM_EMAILS = 50
|
|
|
# Newsletter categories substituted into PROMPT_TEMPLATE's {newsletter_type}
# placeholder. main() cycles through this list with a modulo index, so the
# categories repeat once the list is exhausted.
NEWSLETTER_TYPES = [
    "VA (Veterans Affairs) benefits update newsletter",
    "Social Security Administration monthly update",
    "IRS tax season reminder newsletter",
    "State university alumni association newsletter",
    "Public library community events newsletter",
    "City council weekly digest",
    "County health department wellness newsletter",
    "State DMV services update",
    "Federal student aid (FAFSA) reminder email",
    "National Park Service seasonal newsletter",
]
|
|
|
# User prompt sent to the model; {newsletter_type} is filled in with one entry
# of NEWSLETTER_TYPES via str.format().
# NOTE(review): the request for words such as "free" and "click here" is
# presumably there so that these legitimate (ham) examples share vocabulary
# with spam -- confirm against the downstream classifier's needs.
# NOTE(review): the trailing "/no_think" looks like a model directive to
# suppress reasoning output -- confirm it is honoured by the model in use.
PROMPT_TEMPLATE = """Write a realistic {newsletter_type} email.

Requirements:
- Include a Subject: line at the top
- Use professional but accessible language
- Include 2-4 links (use realistic placeholder URLs like https://www.va.gov/benefits)
- Mention words like "free", "click here", "subscribe", "limited time" naturally where appropriate
- Include a mix of uppercase headers and normal text
- 150-400 words in the body
- End with an unsubscribe notice

Write ONLY the email (Subject line + body). No commentary. /no_think"""
|
|
|
|
|
def generate_email(newsletter_type, model='qwen3.5:2b'):
    """Generate one synthetic newsletter email via Ollama.

    Args:
        newsletter_type: Description of the newsletter to write; substituted
            into PROMPT_TEMPLATE.
        model: Name of the Ollama model to query.

    Returns:
        The generated email text with any ``<think>...</think>`` section
        removed and surrounding whitespace stripped (may be an empty string
        if the response carried no content), or ``None`` when the request
        failed, returned a non-200 status, or had an unusable payload.
        This function reports failures on stdout instead of raising.
    """
    # Function-scope import: the module's import block does not provide `re`.
    import re

    prompt = PROMPT_TEMPLATE.format(newsletter_type=newsletter_type)
    payload = {
        'model': model,
        'messages': [{'role': 'user', 'content': prompt}],
        'stream': False,
        'think': False,
        'options': {'temperature': 0.8, 'num_predict': 600},
    }

    try:
        resp = requests.post(OLLAMA_API, json=payload, timeout=120)
        if resp.status_code != 200:
            # Previously a non-200 reply fell through silently; say why the
            # email is being dropped so failures can be diagnosed.
            print(f' Error: HTTP {resp.status_code} from Ollama')
            return None
        content = resp.json().get('message', {}).get('content', '')
        # Drop any reasoning section the model emitted despite the
        # think=False / "/no_think" hints.
        return re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL).strip()
    except (requests.RequestException, ValueError, AttributeError, TypeError) as e:
        # RequestException: transport/timeout problems; ValueError: body is
        # not JSON; AttributeError/TypeError: JSON has an unexpected shape.
        print(f' Error: {e}')
        return None
|
|
|
|
|
def main():
    """Top up OUTPUT_DIR with generated newsletters until NUM_EMAILS exist.

    Steps:
      1. Ensure OUTPUT_DIR exists.
      2. Query the local Ollama server for installed models and pick the
         first one whose name contains ``qwen3.5``; print a hint and return
         if the server is unreachable or no such model is installed.
      3. Count existing ``*.txt`` files and make one generation attempt for
         each missing file. Results that are empty or 50 characters or
         shorter are skipped.

    Each attempt writes to the lowest ``newsletter_NNN.txt`` index that is
    not already present. Deriving the index from the file *count* (as was
    done before) collides with existing names whenever an earlier run left
    a gap, silently overwriting files and never reaching the target.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Only the network call and payload parsing can fail here.
    try:
        resp = requests.get('http://localhost:11434/api/tags', timeout=5)
        models = [m['name'] for m in resp.json().get('models', [])]
    except (requests.RequestException, ValueError, KeyError,
            AttributeError, TypeError):
        print('Ollama not running. Start with: ollama serve')
        return

    qwen = [m for m in models if 'qwen3.5' in m]
    if not qwen:
        print('No qwen3.5 model found in Ollama. Run: ollama pull qwen3.5:2b')
        return
    model = qwen[0]
    print(f'Using model: {model}')

    existing = list(OUTPUT_DIR.glob('*.txt'))
    have = len(existing)
    needed = NUM_EMAILS - have

    if needed <= 0:
        print(f'Already have {have} newsletters. Done.')
        return

    print(f'Generating {needed} newsletters (have {have}, target {NUM_EMAILS})...')

    # Names already on disk; used to find free slots without overwriting.
    taken = {path.name for path in existing}
    idx = 0
    for attempt in range(needed):
        # Advance to the lowest index whose file does not exist yet.
        while f'newsletter_{idx:03d}.txt' in taken:
            idx += 1
        # Tie the category to the file index so the mix stays balanced
        # across re-runs instead of restarting from the first category.
        ntype = NEWSLETTER_TYPES[idx % len(NEWSLETTER_TYPES)]
        print(f' [{have + attempt + 1}/{NUM_EMAILS}] {ntype}...')
        email = generate_email(ntype, model)
        if email and len(email) > 50:
            outfile = OUTPUT_DIR / f'newsletter_{idx:03d}.txt'
            outfile.write_text(email, encoding='utf-8')
        else:
            print(' Skipped (too short or failed)')
        # One attempt per slot; a failed slot is left free for the next run.
        idx += 1

    total = len(list(OUTPUT_DIR.glob('*.txt')))
    print(f'\nDone. {total} newsletter files in {OUTPUT_DIR}')
|
|
|
|
|
# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|
|
|