"""Generate synthetic newsletter ham examples using Ollama.

Produces 50 realistic government/institutional newsletter emails and writes
each one to its own ``newsletter_NNN.txt`` file under ``OUTPUT_DIR``.

Run:
    python generate_newsletters.py
"""

import json
import re
from pathlib import Path

import requests

OLLAMA_API = 'http://localhost:11434/api/chat'
OUTPUT_DIR = Path(__file__).parent / 'data' / 'raw' / 'newsletters'
NUM_EMAILS = 50

NEWSLETTER_TYPES = [
    "VA (Veterans Affairs) benefits update newsletter",
    "Social Security Administration monthly update",
    "IRS tax season reminder newsletter",
    "State university alumni association newsletter",
    "Public library community events newsletter",
    "City council weekly digest",
    "County health department wellness newsletter",
    "State DMV services update",
    "Federal student aid (FAFSA) reminder email",
    "National Park Service seasonal newsletter",
]

# The prompt text is kept character-for-character as it appears in the source
# (one paragraph, then a newline before the trailing "/no_think" directive).
# NOTE(review): the "- " bullet markers suggest this was once a multi-line
# list -- confirm before reflowing, since any change alters model output.
PROMPT_TEMPLATE = (
    'Write a realistic {newsletter_type} email. '
    'Requirements: '
    '- Include a Subject: line at the top '
    '- Use professional but accessible language '
    '- Include 2-4 links (use realistic placeholder URLs like '
    'https://www.va.gov/benefits) '
    '- Mention words like "free", "click here", "subscribe", "limited time" '
    'naturally where appropriate '
    '- Include a mix of uppercase headers and normal text '
    '- 150-400 words in the body '
    '- End with an unsubscribe notice '
    'Write ONLY the email (Subject line + body). No commentary. \n'
    '/no_think'
)

# Matches a reasoning block emitted by the model despite 'think': False.
# The previous pattern was r'.*?', which can only match empty strings and
# therefore removed nothing.
# NOTE(review): presumably the <think> tags were lost from the pattern at
# some point -- confirm this is the intended markup.
_THINK_BLOCK_RE = re.compile(r'<think>.*?</think>', re.DOTALL)


def _free_slots(taken_names, limit):
    """Return indices in ``range(limit)`` whose newsletter file name is unused."""
    return [
        idx for idx in range(limit)
        if f'newsletter_{idx:03d}.txt' not in taken_names
    ]


def generate_email(newsletter_type, model='qwen3.5:2b'):
    """Generate one synthetic newsletter email via Ollama.

    Args:
        newsletter_type: Description substituted into ``PROMPT_TEMPLATE``.
        model: Name of the Ollama model to query.

    Returns:
        The generated text with any ``<think>...</think>`` block removed and
        surrounding whitespace stripped, or ``None`` if the request failed,
        returned a non-200 status, or carried no usable message content.
    """
    prompt = PROMPT_TEMPLATE.format(newsletter_type=newsletter_type)
    request_body = {
        'model': model,
        'messages': [{'role': 'user', 'content': prompt}],
        'stream': False,
        'think': False,
        'options': {'temperature': 0.8, 'num_predict': 600},
    }

    try:
        resp = requests.post(OLLAMA_API, json=request_body, timeout=120)
        if resp.status_code != 200:
            # Previously this path returned None without saying why.
            print(f' Error: HTTP {resp.status_code}')
            return None
        payload = resp.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f' Error: {e}')
        return None

    message = payload.get('message') if isinstance(payload, dict) else None
    content = message.get('content', '') if isinstance(message, dict) else None
    if not isinstance(content, str):
        print(' Error: response has no message content')
        return None

    return _THINK_BLOCK_RE.sub('', content).strip()


def main():
    """Fill ``OUTPUT_DIR`` with generated newsletters up to ``NUM_EMAILS``.

    Checks that Ollama is reachable and has a qwen3.5 model, then attempts one
    generation per missing file. Only unused ``newsletter_NNN.txt`` slots are
    written, so a rerun after skipped generations fills the gaps instead of
    overwriting files that already exist.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Check Ollama availability
    try:
        resp = requests.get('http://localhost:11434/api/tags', timeout=5)
        models = [m['name'] for m in resp.json().get('models', [])]
    except (ValueError, KeyError, TypeError, AttributeError) as e:
        # The server answered, but not with the expected model listing.
        print(f'Unexpected response from Ollama /api/tags: {e}')
        return
    except requests.RequestException:
        print('Ollama not running. Start with: ollama serve')
        return

    qwen = [m for m in models if 'qwen3.5' in m]
    if not qwen:
        print('No qwen3.5 model found in Ollama. Run: ollama pull qwen3.5:2b')
        return
    model = qwen[0]
    print(f'Using model: {model}')

    existing = list(OUTPUT_DIR.glob('*.txt'))
    have = len(existing)
    needed = NUM_EMAILS - have
    if needed <= 0:
        print(f'Already have {have} newsletters. Done.')
        return

    print(f'Generating {needed} newsletters (have {have}, target {NUM_EMAILS})...')

    # Numbering from len(existing) collides with existing files as soon as an
    # earlier run skipped an index; pick genuinely unused slots instead.
    taken = {path.name for path in existing}
    slots = _free_slots(taken, NUM_EMAILS)[:needed]

    for position, slot in enumerate(slots, start=have + 1):
        # Key the type on the file slot so the rotation stays balanced even
        # when a run only fills gaps.
        ntype = NEWSLETTER_TYPES[slot % len(NEWSLETTER_TYPES)]
        print(f' [{position}/{NUM_EMAILS}] {ntype}...')
        email = generate_email(ntype, model)
        if email and len(email) > 50:
            outfile = OUTPUT_DIR / f'newsletter_{slot:03d}.txt'
            outfile.write_text(email, encoding='utf-8')
        else:
            print(' Skipped (too short or failed)')

    total = sum(1 for _ in OUTPUT_DIR.glob('*.txt'))
    print(f'\nDone. {total} newsletter files in {OUTPUT_DIR}')


if __name__ == '__main__':
    main()