avinash-rai committed on
Commit
6af17ac
·
1 Parent(s): a7435c6

fix: Remove 'scammer' word leak and improve human-likeness

Browse files

Critical fixes:
- Changed SCAMMER: to CALLER: in all prompts to prevent LLM echoing
- Added explicit forbidden words list in prompt (scam, fraud, scammer, bot, AI, honeypot, detection)
- Added post-generation sanitization to replace any leaked forbidden words
- Changed history format from 'Scammer/You' to 'Caller/Me'
- Added human imperfection instructions (typos, hesitation, incomplete sentences)

app/agents/persona_engine.py CHANGED
@@ -934,7 +934,7 @@ class PersonaEngine:
934
  for m in history[-2:]:
935
  s_msg = m.get('scammer_message', '')[:150] + ("..." if len(m.get('scammer_message', '')) > 150 else "")
936
  h_rsp = m.get('honeypot_response', '')[:150] + ("..." if len(m.get('honeypot_response', '')) > 150 else "")
937
- hist_str += f"Scammer: {s_msg}\nYou: {h_rsp}\n"
938
 
939
  # 2. Truncate current message aggressively
940
  safe_message = message[:500] + ("..." if len(message) > 500 else "")
@@ -984,12 +984,21 @@ class PersonaEngine:
984
 
985
  if isinstance(response, str):
986
  clean = response.strip().strip('"')
987
- return clean if clean else None
988
  elif hasattr(response, 'content') and response.content:
989
  clean = response.content.strip().strip('"')
990
- return clean if clean else None
 
 
 
 
 
 
 
 
 
 
991
 
992
- return None
993
 
994
  def _static_response(
995
  self,
 
934
  for m in history[-2:]:
935
  s_msg = m.get('scammer_message', '')[:150] + ("..." if len(m.get('scammer_message', '')) > 150 else "")
936
  h_rsp = m.get('honeypot_response', '')[:150] + ("..." if len(m.get('honeypot_response', '')) > 150 else "")
937
+ hist_str += f"Caller: {s_msg}\nMe: {h_rsp}\n"
938
 
939
  # 2. Truncate current message aggressively
940
  safe_message = message[:500] + ("..." if len(message) > 500 else "")
 
984
 
985
  if isinstance(response, str):
986
  clean = response.strip().strip('"')
 
987
  elif hasattr(response, 'content') and response.content:
988
  clean = response.content.strip().strip('"')
989
+ else:
990
+ return None
991
+
992
+ # 🔥 CRITICAL: Sanitize forbidden words that break honeypot illusion
993
+ forbidden_words = ['scammer', 'scam', 'fraud', 'honeypot', 'bot', 'ai assistant', 'detection']
994
+ clean_lower = clean.lower()
995
+ for word in forbidden_words:
996
+ if word in clean_lower:
997
+ # Replace forbidden word with neutral alternative
998
+ import re
999
+ clean = re.sub(rf'\b{word}\b', 'sir' if word == 'scammer' else 'this', clean, flags=re.IGNORECASE)
1000
 
1001
+ return clean if clean else None
1002
 
1003
  def _static_response(
1004
  self,
app/core/llm_client.py CHANGED
@@ -1771,8 +1771,8 @@ class LocalHFClient(BaseLLMClient):
1771
  """Simplify complex prompts for small models."""
1772
  # For small models, use a very simple format
1773
  # Extract the key message if it's a complex prompt
1774
- if "SCAMMER:" in prompt:
1775
- parts = prompt.split("SCAMMER:")
1776
  if len(parts) > 1:
1777
  msg = parts[-1].strip()[:200]
1778
  return f"Reply as a confused Indian person to: {msg}\nReply:"
 
1771
  """Simplify complex prompts for small models."""
1772
  # For small models, use a very simple format
1773
  # Extract the key message if it's a complex prompt
1774
+ if "CALLER:" in prompt:
1775
+ parts = prompt.split("CALLER:")
1776
  if len(parts) > 1:
1777
  msg = parts[-1].strip()[:200]
1778
  return f"Reply as a confused Indian person to: {msg}\nReply:"
app/core/memory.py CHANGED
@@ -205,8 +205,8 @@ class ConversationMemory:
205
  lines = []
206
 
207
  for msg in history:
208
- lines.append(f"Scammer: {msg['scammer_message']}")
209
- lines.append(f"You: {msg['honeypot_response']}")
210
 
211
  return "\n".join(lines)
212
 
 
205
  lines = []
206
 
207
  for msg in history:
208
+ lines.append(f"Caller: {msg['scammer_message']}")
209
+ lines.append(f"Me: {msg['honeypot_response']}")
210
 
211
  return "\n".join(lines)
212
 
app/core/prompts.py CHANGED
@@ -16,21 +16,22 @@ from app.core.static_prompts import (
16
  # ─────────────────────────────────────────────────────────────────────────────
17
  # COMPACT FAST_CHAT PROMPT (Optimized for Groq Dev Tier - 6K TPM)
18
  # ─────────────────────────────────────────────────────────────────────────────
19
- FAST_CHAT_PROMPT = '''You are {persona_name}, a confused Indian person being scammed.
20
  TRAITS: {persona_traits} | MOOD: {agitation} | PHASE: {phase} | LANGUAGE: {language}
21
 
22
  RULES:
23
  - Sound human, respond in {language_instruction}, 1-2 lines max
24
- - Never reveal you're AI or mention scam/fraud
25
  - Ask clarifying questions, delay, act confused
 
26
  - Max 1 emoji (😟 🙏 🤔)
27
 
28
- HISTORY:
29
  {history}
30
 
31
- SCAMMER: {message}
32
 
33
- Reply as {persona_name} in {language_instruction} (confused, hesitant):'''
34
 
35
  # ─────────────────────────────────────────────────────────────────────────────
36
  # RESPONSE GENERATION (The "Firewalled" Humanization Layer)
 
16
  # ─────────────────────────────────────────────────────────────────────────────
17
  # COMPACT FAST_CHAT PROMPT (Optimized for Groq Dev Tier - 6K TPM)
18
  # ─────────────────────────────────────────────────────────────────────────────
19
+ FAST_CHAT_PROMPT = '''You are {persona_name}, a confused Indian person on a call.
20
  TRAITS: {persona_traits} | MOOD: {agitation} | PHASE: {phase} | LANGUAGE: {language}
21
 
22
  RULES:
23
  - Sound human, respond in {language_instruction}, 1-2 lines max
24
+ - NEVER use words: scam, fraud, scammer, bot, AI, honeypot, detection
25
  - Ask clarifying questions, delay, act confused
26
+ - Be imperfect: typos, hesitation ("umm", "uh"), incomplete sentences
27
  - Max 1 emoji (😟 🙏 🤔)
28
 
29
+ CONVERSATION:
30
  {history}
31
 
32
+ CALLER: {message}
33
 
34
+ Reply as {persona_name} in {language_instruction} (confused, hesitant, human):'''
35
 
36
  # ─────────────────────────────────────────────────────────────────────────────
37
  # RESPONSE GENERATION (The "Firewalled" Humanization Layer)
app/database/memory_db.py CHANGED
@@ -323,8 +323,8 @@ class DatabaseMemoryStore:
323
  lines = []
324
 
325
  for msg in history:
326
- lines.append(f"Scammer: {msg.get('scammer_message', '')}")
327
- lines.append(f"You: {msg.get('honeypot_response', '')}")
328
 
329
  return "\n".join(lines)
330
 
 
323
  lines = []
324
 
325
  for msg in history:
326
+ lines.append(f"Caller: {msg.get('scammer_message', '')}")
327
+ lines.append(f"Me: {msg.get('honeypot_response', '')}")
328
 
329
  return "\n".join(lines)
330
 
scripts/quick_extraction_test.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Quick test to verify intelligence extraction on HF API"""
2
+ import requests
3
+ import json
4
+
5
+ # Test 1: Direct API v1/analyze endpoint (returns full intelligence)
6
+ print("=" * 60)
7
+ print("TEST 1: Direct /api/v1/analyze endpoint (full response)")
8
+ print("=" * 60)
9
+
10
+ url1 = 'https://avinashanalytics-sentinel-scam-honeypo.hf.space/api/v1/analyze'
11
+ payload1 = {
12
+ 'conversationId': 'test-extraction-002',
13
+ 'sender': 'scammer',
14
+ 'message': 'Send money now to UPI: scammer.fraud@paytm or visit http://fake-bank-secure.com/login. OTP is 847291. Call me at 9876543210.',
15
+ 'metadata': {
16
+ 'language': 'English',
17
+ 'channel': 'sms'
18
+ }
19
+ }
20
+
21
+ print(f'URL: {url1}')
22
+ print()
23
+
24
+ try:
25
+ resp = requests.post(url1, json=payload1, timeout=60)
26
+ data = resp.json()
27
+ print(f'Status: {resp.status_code}')
28
+ print()
29
+ print('--- RESPONSE (truncated) ---')
30
+ print(json.dumps(data, indent=2, default=str)[:2000])
31
+ except Exception as e:
32
+ print(f'Error: {e}')
33
+
34
+ # Test 2: GUVI endpoint (minimal response, callback sends intel)
35
+ print()
36
+ print("=" * 60)
37
+ print("TEST 2: GUVI /api/guvi/analyze endpoint (minimal response)")
38
+ print("=" * 60)
39
+
40
+ url2 = 'https://avinashanalytics-sentinel-scam-honeypo.hf.space/api/guvi/analyze'
41
+ headers = {
42
+ 'x-api-key': 'GUVI_HACKATHON_V2',
43
+ 'Content-Type': 'application/json'
44
+ }
45
+ payload2 = {
46
+ 'sessionId': 'test-extraction-003',
47
+ 'processId': 'proc-003',
48
+ 'message': {
49
+ 'text': 'Send money now to UPI: scammer.fraud@paytm or visit http://fake-bank-secure.com/login. OTP is 847291.',
50
+ 'sender': 'scammer'
51
+ },
52
+ 'metadata': {
53
+ 'language': 'English',
54
+ 'channel': 'sms'
55
+ }
56
+ }
57
+
58
+ print(f'URL: {url2}')
59
+ print()
60
+
61
+ try:
62
+ resp = requests.post(url2, json=payload2, headers=headers, timeout=60)
63
+ data = resp.json()
64
+ print(f'Status: {resp.status_code}')
65
+ print()
66
+ print('--- RESPONSE ---')
67
+ print(json.dumps(data, indent=2, default=str))
68
+ print()
69
+ print("NOTE: Full intelligence is sent via CALLBACK to GUVI, not in this response.")
70
+ except Exception as e:
71
+ print(f'Error: {e}')
72
+ import traceback
73
+ traceback.print_exc()