spam-xai-model-v2 / generate_newsletters.py
VoltageVagabond's picture
Upload folder using huggingface_hub
960ec3d verified
raw
history blame
3.77 kB
"""
Generate synthetic newsletter ham examples using Ollama.
Produces 50 realistic government/institutional newsletter emails.
Run: python generate_newsletters.py
"""
import json
import requests
from pathlib import Path
# Chat endpoint of the local Ollama server that generate_email() posts to.
OLLAMA_API = "http://localhost:11434/api/chat"

# Generated emails are written to data/raw/newsletters next to this script.
OUTPUT_DIR = Path(__file__).parent.joinpath("data", "raw", "newsletters")

# Target number of newsletter files that main() tops OUTPUT_DIR up to.
NUM_EMAILS = 50
# Newsletter kinds substituted into the prompt. main() picks entries by
# position (index modulo the list length), so the order here matters.
NEWSLETTER_TYPES = [
    'VA (Veterans Affairs) benefits update newsletter',
    'Social Security Administration monthly update',
    'IRS tax season reminder newsletter',
    'State university alumni association newsletter',
    'Public library community events newsletter',
    'City council weekly digest',
    'County health department wellness newsletter',
    'State DMV services update',
    'Federal student aid (FAFSA) reminder email',
    'National Park Service seasonal newsletter',
]
# User-message prompt for the model; generate_email() fills in
# {newsletter_type} via str.format. The requirements deliberately ask for
# links and phrases such as "free" and "click here" in the generated email.
PROMPT_TEMPLATE = "\n".join((
    "Write a realistic {newsletter_type} email.",
    "Requirements:",
    "- Include a Subject: line at the top",
    "- Use professional but accessible language",
    "- Include 2-4 links (use realistic placeholder URLs like https://www.va.gov/benefits)",
    '- Mention words like "free", "click here", "subscribe", "limited time" naturally where appropriate',
    "- Include a mix of uppercase headers and normal text",
    "- 150-400 words in the body",
    "- End with an unsubscribe notice",
    "Write ONLY the email (Subject line + body). No commentary. /no_think",
))
def generate_email(newsletter_type, model='qwen3.5:2b'):
    """Generate one synthetic newsletter email via Ollama.

    Args:
        newsletter_type: Description of the newsletter to write; it is
            substituted into PROMPT_TEMPLATE.
        model: Name of the Ollama model to query.

    Returns:
        The generated text with any ``<think>...</think>`` spans removed and
        surrounding whitespace stripped (possibly an empty string), or None
        when the request fails, the server answers with a non-200 status, or
        the response body cannot be parsed. Every failure is reported on
        stdout so the caller's "skipped" message has a visible cause.
    """
    # Function-scope import: the module's top-level imports do not include re.
    import re

    prompt = PROMPT_TEMPLATE.format(newsletter_type=newsletter_type)
    payload = {
        'model': model,
        'messages': [{'role': 'user', 'content': prompt}],
        'stream': False,
        'think': False,
        'options': {'temperature': 0.8, 'num_predict': 600},
    }

    try:
        resp = requests.post(OLLAMA_API, json=payload, timeout=120)
    except requests.RequestException as e:
        print(f' Error: {e}')
        return None

    # Previously a non-200 answer fell through silently; surface it instead.
    if resp.status_code != 200:
        print(f' Error: HTTP {resp.status_code} from Ollama')
        return None

    try:
        # "or" guards against explicit nulls for 'message' / 'content'.
        message = resp.json().get('message') or {}
        content = message.get('content') or ''
        # Drop any reasoning block the model emitted despite 'think': False.
        return re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL).strip()
    except (ValueError, AttributeError, TypeError) as e:
        # ValueError: body is not JSON; AttributeError/TypeError: unexpected shape.
        print(f' Error: {e}')
        return None
def main():
    """Top up OUTPUT_DIR to NUM_EMAILS generated newsletter files.

    Steps:
      1. Create OUTPUT_DIR if needed.
      2. Query the local Ollama server for installed models and pick the
         first whose name contains 'qwen3.5'; return early with a hint if
         the server is unreachable or no such model exists.
      3. Count the existing ``*.txt`` files and generate only the shortfall.

    New files are written to the lowest ``newsletter_NNN.txt`` indices that
    are not already taken. Deriving the index from the file count (as was
    done before) collides with existing files whenever an earlier run skipped
    an email, silently overwriting data and never reaching the target.
    The newsletter type is chosen from the file index, so a resumed run keeps
    cycling through NEWSLETTER_TYPES instead of restarting at the first type.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Check Ollama availability
    try:
        resp = requests.get('http://localhost:11434/api/tags', timeout=5)
        models = [m['name'] for m in resp.json().get('models', [])]
    except (requests.RequestException, ValueError, KeyError,
            TypeError, AttributeError):
        print('Ollama not running. Start with: ollama serve')
        return

    qwen = [m for m in models if 'qwen3.5' in m]
    if not qwen:
        print('No qwen3.5 model found in Ollama. Run: ollama pull qwen3.5:2b')
        return
    model = qwen[0]
    print(f'Using model: {model}')

    existing = list(OUTPUT_DIR.glob('*.txt'))
    have = len(existing)
    needed = NUM_EMAILS - have
    if needed <= 0:
        print(f'Already have {len(existing)} newsletters. Done.')
        return

    # Collect the numeric indices already used by newsletter_<digits>.txt files.
    prefix = 'newsletter_'
    used = set()
    for path in existing:
        suffix = path.stem[len(prefix):]
        if path.stem.startswith(prefix) and suffix.isdecimal():
            used.add(int(suffix))

    # Pick the `needed` smallest indices that are still free, filling gaps first.
    free_indices = []
    candidate = 0
    while len(free_indices) < needed:
        if candidate not in used:
            free_indices.append(candidate)
        candidate += 1

    print(f'Generating {needed} newsletters (have {have}, target {NUM_EMAILS})...')

    for position, idx in enumerate(free_indices, start=have + 1):
        ntype = NEWSLETTER_TYPES[idx % len(NEWSLETTER_TYPES)]
        print(f' [{position}/{NUM_EMAILS}] {ntype}...')
        email = generate_email(ntype, model)
        if email and len(email) > 50:
            outfile = OUTPUT_DIR / f'newsletter_{idx:03d}.txt'
            outfile.write_text(email, encoding='utf-8')
        else:
            print(' Skipped (too short or failed)')

    total = len(list(OUTPUT_DIR.glob('*.txt')))
    print(f'\nDone. {total} newsletter files in {OUTPUT_DIR}')
# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()