Spaces:

AvinashAnalytics
/

sentinel-scam-honeypo

Paused

avinash-rai committed on Feb 5

Commit

6af17ac

1 Parent(s): a7435c6

fix: Remove 'scammer' word leak and improve human-likeness

Critical fixes:
- Changed SCAMMER: to CALLER: in all prompts to prevent the LLM from echoing the word "scammer"
- Added explicit forbidden words list in prompt (scam, fraud, scammer, bot, AI, honeypot, detection)
- Added post-generation sanitization to replace any leaked forbidden words
- Changed history format from 'Scammer/You' to 'Caller/Me'
- Added human imperfection instructions (typos, hesitation, incomplete sentences)

Files changed (6) hide show

app/agents/persona_engine.py +13 -4
app/core/llm_client.py +2 -2
app/core/memory.py +2 -2
app/core/prompts.py +6 -5
app/database/memory_db.py +2 -2
scripts/quick_extraction_test.py +73 -0

app/agents/persona_engine.py CHANGED Viewed

@@ -934,7 +934,7 @@ class PersonaEngine:
              for m in history[-2:]:
                   s_msg = m.get('scammer_message', '')[:150] + ("..." if len(m.get('scammer_message', '')) > 150 else "")
                   h_rsp = m.get('honeypot_response', '')[:150] + ("..." if len(m.get('honeypot_response', '')) > 150 else "")
-                  hist_str += f"Scammer: {s_msg}\nYou: {h_rsp}\n"
         # 2. Truncate current message aggressively
         safe_message = message[:500] + ("..." if len(message) > 500 else "")
@@ -984,12 +984,21 @@ class PersonaEngine:
         if isinstance(response, str):
             clean = response.strip().strip('"')
-            return clean if clean else None
         elif hasattr(response, 'content') and response.content:
             clean = response.content.strip().strip('"')
-            return clean if clean else None
-        return None
     def _static_response(
         self,

              for m in history[-2:]:
                   s_msg = m.get('scammer_message', '')[:150] + ("..." if len(m.get('scammer_message', '')) > 150 else "")
                   h_rsp = m.get('honeypot_response', '')[:150] + ("..." if len(m.get('honeypot_response', '')) > 150 else "")
+                  hist_str += f"Caller: {s_msg}\nMe: {h_rsp}\n"
         # 2. Truncate current message aggressively
         safe_message = message[:500] + ("..." if len(message) > 500 else "")
         if isinstance(response, str):
             clean = response.strip().strip('"')
         elif hasattr(response, 'content') and response.content:
             clean = response.content.strip().strip('"')
+        else:
+            return None
+        # 🔥 CRITICAL: Sanitize forbidden words that break honeypot illusion
+        forbidden_words = ['scammer', 'scam', 'fraud', 'honeypot', 'bot', 'ai assistant', 'detection']
+        clean_lower = clean.lower()
+        for word in forbidden_words:
+            if word in clean_lower:
+                # Replace forbidden word with neutral alternative
+                import re
+                clean = re.sub(rf'\b{word}\b', 'sir' if word == 'scammer' else 'this', clean, flags=re.IGNORECASE)
+        return clean if clean else None
     def _static_response(
         self,

app/core/llm_client.py CHANGED Viewed

@@ -1771,8 +1771,8 @@ class LocalHFClient(BaseLLMClient):
         """Simplify complex prompts for small models."""
         # For small models, use a very simple format
         # Extract the key message if it's a complex prompt
-        if "SCAMMER:" in prompt:
-            parts = prompt.split("SCAMMER:")
             if len(parts) > 1:
                 msg = parts[-1].strip()[:200]
                 return f"Reply as a confused Indian person to: {msg}\nReply:"

         """Simplify complex prompts for small models."""
         # For small models, use a very simple format
         # Extract the key message if it's a complex prompt
+        if "CALLER:" in prompt:
+            parts = prompt.split("CALLER:")
             if len(parts) > 1:
                 msg = parts[-1].strip()[:200]
                 return f"Reply as a confused Indian person to: {msg}\nReply:"

app/core/memory.py CHANGED Viewed

@@ -205,8 +205,8 @@ class ConversationMemory:
         lines = []
         for msg in history:
-            lines.append(f"Scammer: {msg['scammer_message']}")
-            lines.append(f"You: {msg['honeypot_response']}")
         return "\n".join(lines)

         lines = []
         for msg in history:
+            lines.append(f"Caller: {msg['scammer_message']}")
+            lines.append(f"Me: {msg['honeypot_response']}")
         return "\n".join(lines)

app/core/prompts.py CHANGED Viewed

@@ -16,21 +16,22 @@ from app.core.static_prompts import (
 # ─────────────────────────────────────────────────────────────────────────────
 # COMPACT FAST_CHAT PROMPT (Optimized for Groq Dev Tier - 6K TPM)
 # ─────────────────────────────────────────────────────────────────────────────
-FAST_CHAT_PROMPT = '''You are {persona_name}, a confused Indian person being scammed.
 TRAITS: {persona_traits} | MOOD: {agitation} | PHASE: {phase} | LANGUAGE: {language}
 RULES:
 - Sound human, respond in {language_instruction}, 1-2 lines max
-- Never reveal you're AI or mention scam/fraud
 - Ask clarifying questions, delay, act confused
 - Max 1 emoji (😟 🙏 🤔)
-HISTORY:
 {history}
-SCAMMER: {message}
-Reply as {persona_name} in {language_instruction} (confused, hesitant):'''
 # ─────────────────────────────────────────────────────────────────────────────
 # RESPONSE GENERATION (The "Firewalled" Humanization Layer)

 # ─────────────────────────────────────────────────────────────────────────────
 # COMPACT FAST_CHAT PROMPT (Optimized for Groq Dev Tier - 6K TPM)
 # ─────────────────────────────────────────────────────────────────────────────
+FAST_CHAT_PROMPT = '''You are {persona_name}, a confused Indian person on a call.
 TRAITS: {persona_traits} | MOOD: {agitation} | PHASE: {phase} | LANGUAGE: {language}
 RULES:
 - Sound human, respond in {language_instruction}, 1-2 lines max
+- NEVER use words: scam, fraud, scammer, bot, AI, honeypot, detection
 - Ask clarifying questions, delay, act confused
+- Be imperfect: typos, hesitation ("umm", "uh"), incomplete sentences
 - Max 1 emoji (😟 🙏 🤔)
+CONVERSATION:
 {history}
+CALLER: {message}
+Reply as {persona_name} in {language_instruction} (confused, hesitant, human):'''
 # ─────────────────────────────────────────────────────────────────────────────
 # RESPONSE GENERATION (The "Firewalled" Humanization Layer)

app/database/memory_db.py CHANGED Viewed

@@ -323,8 +323,8 @@ class DatabaseMemoryStore:
         lines = []
         for msg in history:
-            lines.append(f"Scammer: {msg.get('scammer_message', '')}")
-            lines.append(f"You: {msg.get('honeypot_response', '')}")
         return "\n".join(lines)

         lines = []
         for msg in history:
+            lines.append(f"Caller: {msg.get('scammer_message', '')}")
+            lines.append(f"Me: {msg.get('honeypot_response', '')}")
         return "\n".join(lines)

scripts/quick_extraction_test.py ADDED Viewed

	@@ -0,0 +1,73 @@

+"""Quick test to verify intelligence extraction on HF API"""
+import requests
+import json
+# Test 1: Direct API v1/analyze endpoint (returns full intelligence)
+print("=" * 60)
+print("TEST 1: Direct /api/v1/analyze endpoint (full response)")
+print("=" * 60)
+url1 = 'https://avinashanalytics-sentinel-scam-honeypo.hf.space/api/v1/analyze'
+payload1 = {
+    'conversationId': 'test-extraction-002',
+    'sender': 'scammer',
+    'message': 'Send money now to UPI: scammer.fraud@paytm or visit http://fake-bank-secure.com/login. OTP is 847291. Call me at 9876543210.',
+    'metadata': {
+        'language': 'English',
+        'channel': 'sms'
+    }
+}
+print(f'URL: {url1}')
+print()
+try:
+    resp = requests.post(url1, json=payload1, timeout=60)
+    data = resp.json()
+    print(f'Status: {resp.status_code}')
+    print()
+    print('--- RESPONSE (truncated) ---')
+    print(json.dumps(data, indent=2, default=str)[:2000])
+except Exception as e:
+    print(f'Error: {e}')
+# Test 2: GUVI endpoint (minimal response, callback sends intel)
+print()
+print("=" * 60)
+print("TEST 2: GUVI /api/guvi/analyze endpoint (minimal response)")
+print("=" * 60)
+url2 = 'https://avinashanalytics-sentinel-scam-honeypo.hf.space/api/guvi/analyze'
+headers = {
+    'x-api-key': 'GUVI_HACKATHON_V2',
+    'Content-Type': 'application/json'
+}
+payload2 = {
+    'sessionId': 'test-extraction-003',
+    'processId': 'proc-003',
+    'message': {
+        'text': 'Send money now to UPI: scammer.fraud@paytm or visit http://fake-bank-secure.com/login. OTP is 847291.',
+        'sender': 'scammer'
+    },
+    'metadata': {
+        'language': 'English',
+        'channel': 'sms'
+    }
+}
+print(f'URL: {url2}')
+print()
+try:
+    resp = requests.post(url2, json=payload2, headers=headers, timeout=60)
+    data = resp.json()
+    print(f'Status: {resp.status_code}')
+    print()
+    print('--- RESPONSE ---')
+    print(json.dumps(data, indent=2, default=str))
+    print()
+    print("NOTE: Full intelligence is sent via CALLBACK to GUVI, not in this response.")
+except Exception as e:
+    print(f'Error: {e}')
+    import traceback
+    traceback.print_exc()