NithinReddyG's picture
remove emotion-library moves + praise; tune detection thresholds
5a72b21
raw
history blame contribute delete
22.8 kB
"""Text-to-speech and LLM response generation."""
import logging
from .config import PERSONALITIES, get_random_personality
logger = logging.getLogger(__name__)
# Groq chat models. LLMResponder uses the "smart" one when a custom prompt is
# supplied and the "fast" one otherwise, unless the caller names a model.
DEFAULT_MODEL_FAST = "llama-3.1-8b-instant"
DEFAULT_MODEL_SMART = "llama-3.3-70b-versatile"

# Vocal direction tags supported by Canopy Orpheus on Groq. A tag is written
# inline in the input text (e.g. "[angry] Put it down.") and Orpheus inflects
# the rest of the line accordingly.
ORPHEUS_TAGS = (
    "cheerful",
    "happy",
    "excited",
    "angry",
    "frustrated",
    "sad",
    "whispered",
    "shouting",
    "neutral",
)

import re as _re

# Matches one bracketed tag (case-insensitive) plus any whitespace after it,
# so that removing a match leaves no stray gap in the sentence.
_ORPHEUS_TAG_RE = _re.compile(
    rf"\[({'|'.join(ORPHEUS_TAGS)})\]\s*",
    flags=_re.IGNORECASE,
)
def strip_orpheus_tags(text: str) -> str:
    """Return *text* with every Orpheus ``[tag]`` marker removed.

    Used before handing text to a non-Orpheus TTS engine (Edge / ElevenLabs),
    which would otherwise read the bracketed tag aloud. A falsy *text*
    (``None`` or ``""``) yields an empty string; the result is stripped of
    leading and trailing whitespace.
    """
    if not text:
        return ""
    without_tags = _ORPHEUS_TAG_RE.sub("", text)
    return without_tags.strip()
class LLMResponder:
    """Generate snarky responses using Groq (free) or fall back to pre-written lines.

    When no API key is given, or the Groq client cannot be created, ``client``
    stays ``None`` and every request is answered from the personality's
    pre-written lines in ``PERSONALITIES``.
    """

    # Last-resort lines, used only when the resolved personality ships no
    # pre-written lines, so the fallback path itself can never raise.
    _DEFAULT_SHAME_LINE = "Put the phone down."
    _DEFAULT_PRAISE_LINE = "Good. Keep it down."

    def __init__(
        self,
        api_key: str = "",
        personality: str = "mixtape",
        user_name: str = "",
        custom_prompt: str = "",
        model: str = "",
        enable_orpheus_tags: bool = False,
    ):
        """Store the configuration and, if an API key is given, build the Groq client.

        Args:
            api_key: Groq API key; empty means "pre-written lines only".
            personality: Key into ``PERSONALITIES``, or ``"mixtape"`` to pick a
                random personality on every call.
            user_name: Optional name the model is told to address the user by.
            custom_prompt: Optional user directive that outranks personality tone.
            model: Groq model override; empty or whitespace-only means auto-pick.
            enable_orpheus_tags: Ask the model to prefix replies with a vocal
                direction tag such as ``[angry]``.
        """
        self.api_key = api_key
        self.client = None
        self.personality = personality
        self.user_name = (user_name or "").strip()
        self.custom_prompt = (custom_prompt or "").strip()
        self.enable_orpheus_tags = bool(enable_orpheus_tags)
        # Auto-pick: smarter model when a custom prompt is provided (needs better
        # instruction-following); fast model otherwise. Caller can override.
        # Strip BEFORE testing for emptiness so a whitespace-only override
        # falls back to the default instead of yielding an empty model name.
        self.model = (model or "").strip() or (
            DEFAULT_MODEL_SMART if self.custom_prompt else DEFAULT_MODEL_FAST
        )
        if api_key:
            try:
                from groq import Groq

                self.client = Groq(api_key=api_key)
                logger.info(
                    f"Groq LLM initialized: personality={PERSONALITIES.get(personality, {}).get('name', personality)}"
                    f" model={self.model}"
                    f" name={'SET' if self.user_name else 'UNSET'}"
                    f" custom_prompt={'SET' if self.custom_prompt else 'UNSET'}"
                )
            except ImportError:
                logger.warning("groq package not installed, using pre-written lines")
            except Exception as e:
                # Best-effort: any client construction failure degrades to
                # pre-written lines rather than breaking the caller.
                logger.warning(f"Groq init failed: {e}, using pre-written lines")

    # ------------------------------------------------------------------ helpers

    def _personality_data(self) -> dict:
        """Return the config dict for the active personality.

        ``"mixtape"`` resolves to a fresh random personality on every call;
        an unknown key falls back to ``PERSONALITIES["angry_boss"]``.
        """
        if self.personality == "mixtape":
            name = get_random_personality()
        else:
            name = self.personality
        return PERSONALITIES.get(name, PERSONALITIES["angry_boss"])

    def _user_overlay(self) -> str:
        """Build the 'name + custom user directive' overlay that rides on every Groq call.

        The custom prompt is given top billing — it can change tone, vocabulary,
        and even override personality defaults. Personality is still applied as
        secondary tone guidance by the callers.

        Returns:
            The overlay text, or ``""`` when neither a name nor a custom prompt is set.
        """
        parts = []
        if self.user_name:
            parts.append(f"The user's name is {self.user_name}. Address them by name occasionally.")
        if self.custom_prompt:
            parts.append("USER DIRECTIVE (highest priority — follow this above all other tone rules):")
            parts.append(self.custom_prompt)
        return "\n".join(parts).strip()

    def _directive_section(self) -> str:
        """Return the delimited USER DIRECTIVE prompt section, or ``""`` if there is no overlay."""
        overlay = self._user_overlay()
        if not overlay:
            return ""
        return (
            "==== USER DIRECTIVE (HIGHEST PRIORITY — this is the FINAL instruction, follow it) ====\n"
            + overlay
            + "\n==== END USER DIRECTIVE ===="
        )

    @staticmethod
    def _count_hint(phone_count: int) -> str:
        """Return the short phrase describing how often the phone was picked up."""
        if phone_count == 1:
            return "First time today."
        if phone_count == 2:
            return "Second time."
        if phone_count == 3:
            return "Third time."
        if phone_count <= 5:
            return f"{phone_count} times now."
        return f"{phone_count} times today!"

    def _chat(self, system_content: str, user_content: str, max_tokens: int, temperature: float) -> str:
        """Run one Groq chat completion and return the stripped reply text.

        Raises:
            ValueError: if the model returned no text. Callers catch this like
                any other API failure and use a pre-written line, instead of
                handing an empty string to the TTS stage.
        """
        response = self.client.chat.completions.create(
            model=self.model,
            max_tokens=max_tokens,
            temperature=temperature,
            messages=[
                {"role": "system", "content": system_content},
                {"role": "user", "content": user_content},
            ],
        )
        reply = (response.choices[0].message.content or "").strip()
        if not reply:
            raise ValueError("LLM returned an empty completion")
        return reply

    def _pick_prewritten(self, key: str, default: str) -> str:
        """Pick a random pre-written line stored under *key*, or *default* if none exist."""
        import random

        lines = self._personality_data().get(key) or []
        # random.choice raises IndexError on an empty sequence; this is the
        # last line of defence, so it must not raise.
        return random.choice(lines) if lines else default

    # --------------------------------------------------------------- public API

    def get_response(self, phone_count: int, context: str = "") -> str:
        """Get a snarky response about phone usage.

        Args:
            phone_count: Number of phone pickups so far; drives the user message.
            context: Accepted for backward compatibility; not currently included
                in the prompt.

        Returns:
            The model's reply, or a pre-written shame line when there is no
            client or the API call fails.
        """
        # Fallback to personality-specific pre-written if no API
        if not self.client:
            return self._get_prewritten_shame()
        try:
            personality_data = self._personality_data()
            shame_data = personality_data["shame"]
            voice_desc = personality_data["voice"]
            avoid = personality_data.get("avoid", "")
            has_custom = bool(self.custom_prompt)

            # When the user supplied a custom prompt, relax the strict 8-word cap
            # so they can request longer / different formats (e.g. "always end
            # with a haiku"). Personality-only mode keeps the original tight cap.
            if has_custom:
                max_tokens = 80
                length_rule = (
                    "- Follow the USER DIRECTIVE for length and format. "
                    "If the directive is silent on length, keep it under 20 words."
                )
                priority_rule = "- The USER DIRECTIVE below takes absolute priority over personality and these rules.\n"
                # With a custom prompt, downgrade personality from EXAMPLES (which
                # the model copies verbatim) to a tone-only reference.
                personality_section = (
                    "PERSONALITY TONE REFERENCE (only used for vocal style, do NOT copy these lines):\n"
                    f"{voice_desc}\n"
                    f"TONE: {shame_data['tone']}\n"
                    f"AVOID: {avoid}\n"
                )
            else:
                max_tokens = 20
                length_rule = "- Maximum 8 words. Prefer 3-5 words."
                priority_rule = "- Match the personality's voice exactly.\n"
                # Without a custom prompt the examples drive the style.
                personality_section = "\n".join(
                    [
                        f"{voice_desc}",
                        f"TONE: {shame_data['tone']}",
                        f"STRUCTURE: {shame_data['structure']}",
                        "EXAMPLES:",
                        "\n".join("- " + ex for ex in shame_data["examples"]),
                        f"AVOID: {avoid}",
                    ]
                )

            orpheus_rule = ""
            if self.enable_orpheus_tags:
                orpheus_rule = (
                    "\n- VOCAL DIRECTION: Begin your reply with EXACTLY ONE bracketed tag chosen from "
                    "[angry], [frustrated], [sad], [whispered], [shouting], [cheerful], [neutral]. "
                    "Pick the tag that matches the personality + scolding context. Example: '[angry] Phone. Down. Now.'"
                )

            # Structure: TASK -> PERSONALITY -> RULES -> USER DIRECTIVE (last for
            # maximal recency in the model's attention)
            sections = [
                "TASK: Generate a NEGATIVE/SCOLDING response because someone just picked up their phone (BAD behavior).",
                personality_section,
                "RULES:\n"
                + length_rule + "\n"
                + "- Be CRITICAL/NEGATIVE about picking up the phone.\n"
                + priority_rule
                + "- No emoji. No hashtags."
                + orpheus_rule,
            ]
            directive = self._directive_section()
            if directive:
                sections.append(directive)

            return self._chat(
                system_content="\n\n".join(sections),
                user_content=f"Phone pickup #{phone_count} today. {self._count_hint(phone_count)}",
                max_tokens=max_tokens,
                temperature=1.1,  # High creativity for varied, entertaining responses
            )
        except Exception as e:
            logger.warning(f"Groq API error: {e}, using fallback")
            return self._get_prewritten_shame()

    def get_praise(self) -> str:
        """Get praise for putting the phone down.

        Returns:
            The model's reply, or a pre-written praise line when there is no
            client or the API call fails.
        """
        if not self.client:
            return self._get_prewritten_praise()
        try:
            praise_data = self._personality_data()["praise"]
            personality_prompt = "\n".join(
                [
                    f"TONE: {praise_data['tone']}",
                    "EXAMPLES:",
                    "\n".join("- " + ex for ex in praise_data["examples"]),
                ]
            )
            has_custom = bool(self.custom_prompt)

            if has_custom:
                max_tokens = 60
                length_rule = "- Follow the USER DIRECTIVE for length and format. If silent on length, keep it under 15 words."
                priority_rule = "- The USER DIRECTIVE below takes absolute priority.\n"
                personality_section = (
                    "PERSONALITY TONE REFERENCE (vocal style only — do NOT copy these lines verbatim):\n"
                    f"{personality_prompt}"
                )
            else:
                max_tokens = 15
                length_rule = "- Maximum 5 words. Prefer 2-3 words."
                priority_rule = "- Match the personality's voice exactly.\n"
                personality_section = personality_prompt

            orpheus_rule = ""
            if self.enable_orpheus_tags:
                orpheus_rule = (
                    "\n- VOCAL DIRECTION: Begin your reply with EXACTLY ONE bracketed tag chosen from "
                    "[cheerful], [happy], [excited], [whispered], [neutral]. "
                    "Pick the tag that matches the personality + praising context. Example: '[cheerful] Well done.'"
                )

            sections = [
                "TASK: Generate a POSITIVE/APPROVING response because someone just put their phone down (GOOD behavior).",
                personality_section,
                "RULES:\n"
                + length_rule + "\n"
                + "- Be POSITIVE/APPROVING about putting the phone down.\n"
                + priority_rule
                + "- No emoji."
                + orpheus_rule,
            ]
            directive = self._directive_section()
            if directive:
                sections.append(directive)

            return self._chat(
                system_content="\n\n".join(sections),
                user_content="Phone down.",
                max_tokens=max_tokens,
                temperature=0.8,  # Faster, still varied
            )
        except Exception as e:
            # Log like get_response does, so praise failures are not invisible.
            logger.warning(f"Groq API error: {e}, using fallback")
            return self._get_prewritten_praise()

    def _get_prewritten_shame(self) -> str:
        """Get a personality-specific pre-written shame line (never raises on an empty list)."""
        return self._pick_prewritten("prewritten_shame", self._DEFAULT_SHAME_LINE)

    def _get_prewritten_praise(self) -> str:
        """Get a personality-specific pre-written praise line (never raises on an empty list)."""
        return self._pick_prewritten("prewritten_praise", self._DEFAULT_PRAISE_LINE)
# Groq-hosted Orpheus TTS model identifiers. TextToSpeech defaults to the
# English model; the Arabic one is not referenced in this part of the file.
GROQ_TTS_MODEL_EN = "canopylabs/orpheus-v1-english"
GROQ_TTS_MODEL_AR = "canopylabs/orpheus-arabic-saudi"

# Voice names TextToSpeech accepts for `groq_tts_voice`; any other value is
# replaced by GROQ_TTS_DEFAULT_VOICE.
GROQ_TTS_VOICES_EN = {
    "austin",
    "autumn",
    "daniel",
    "diana",
    "hannah",
    "troy",
}
GROQ_TTS_DEFAULT_VOICE = "autumn"
class TextToSpeech:
    """Convert text to speech using Edge TTS (free), Groq Orpheus, or ElevenLabs.

    Provider priority is controlled by `tts_provider`:
    - "auto"       : prefer ElevenLabs (if eleven_key works), else Groq (if groq_key), else Edge
    - "groq"       : Groq Orpheus only (falls back to Edge on failure)
    - "elevenlabs" : ElevenLabs only (falls back to Edge on failure)
    - "edge"       : Edge TTS only

    Orpheus ``[tag]`` markers are passed through to Groq untouched and stripped
    before text is sent to Edge or ElevenLabs.
    """

    def __init__(
        self,
        elevenlabs_key: str = "",
        voice: str = "",
        eleven_voice_id: str = "",
        personality: str = "mixtape",
        groq_key: str = "",
        groq_tts_voice: str = "",
        groq_tts_model: str = "",
        tts_provider: str = "auto",
    ):
        """Store provider settings and, if a key is given, build the ElevenLabs client.

        Args:
            elevenlabs_key: ElevenLabs API key; empty disables ElevenLabs.
            voice: Edge TTS voice override (wins over the personality default).
            eleven_voice_id: ElevenLabs voice override (wins over the personality default).
            personality: Key into ``PERSONALITIES`` used to look up default voices.
            groq_key: Groq API key; empty disables Groq Orpheus.
            groq_tts_voice: Orpheus voice; values outside ``GROQ_TTS_VOICES_EN``
                are replaced by ``GROQ_TTS_DEFAULT_VOICE``.
            groq_tts_model: Orpheus model id; empty means ``GROQ_TTS_MODEL_EN``.
            tts_provider: One of "auto", "groq", "elevenlabs", "edge"; anything
                else is treated as "auto".
        """
        self.elevenlabs_key = elevenlabs_key
        self.user_edge_voice = voice  # User's custom Edge TTS voice (overrides personality default)
        self.user_eleven_voice = eleven_voice_id  # User's custom ElevenLabs voice (overrides personality default)
        self.personality = personality
        self.eleven_client = None
        # Characters sent to ElevenLabs by THIS instance; it starts at 0 on
        # construction and is not persisted anywhere in this class.
        self.chars_used = 0
        self.MONTHLY_LIMIT = 9000  # Leave buffer under 10k
        self.working_voice_cache = {}  # Cache of personality -> working voice ID

        # Groq TTS state
        self.groq_key = groq_key
        gv = (groq_tts_voice or "").strip().lower()
        self.groq_tts_voice = gv if gv in GROQ_TTS_VOICES_EN else GROQ_TTS_DEFAULT_VOICE
        self.groq_tts_model = (groq_tts_model or "").strip() or GROQ_TTS_MODEL_EN
        provider = (tts_provider or "auto").strip().lower()
        self.tts_provider = provider if provider in {"auto", "groq", "elevenlabs", "edge"} else "auto"

        if elevenlabs_key:
            try:
                from elevenlabs import ElevenLabs

                self.eleven_client = ElevenLabs(api_key=elevenlabs_key)
                logger.info("ElevenLabs TTS initialized (voices will be validated on first use)")
            except ImportError:
                logger.warning("elevenlabs package not installed, using Edge TTS")
            except Exception as e:
                logger.warning(f"ElevenLabs init failed: {e}, using Edge TTS")
        if self.groq_key:
            logger.info(
                f"Groq TTS available: model={self.groq_tts_model} voice={self.groq_tts_voice}"
            )
        logger.info(f"TTS provider selection: {self.tts_provider}")

    @staticmethod
    def _wav_path(output_path: str) -> str:
        """Return *output_path* with its file extension replaced by ``.wav``.

        Only the final path component's extension is touched, so a dot in a
        directory name (``/tmp/my.dir/out``) does not truncate the path.
        """
        import os

        root, _ext = os.path.splitext(output_path)
        return root + ".wav"

    def _get_voice_for_personality(self):
        """Get the appropriate voices based on personality and user override.

        Returns:
            ``(edge_voice, eleven_voices)`` where ``edge_voice`` is a single
            voice name and ``eleven_voices`` is a list of voice IDs to try in order.
        """
        personality_data = PERSONALITIES.get(self.personality, PERSONALITIES["mixtape"])
        # User override always wins for edge voice
        edge_voice = self.user_edge_voice or personality_data.get("default_voice", "en-US-AnaNeural")

        if self.user_eleven_voice:
            # User specified a custom voice
            return edge_voice, [self.user_eleven_voice]

        # Personality config may hold a list ("default_eleven_voices") or a
        # single ID ("default_eleven_voice", kept for backward compatibility).
        eleven_voice_data = personality_data.get(
            "default_eleven_voices",
            personality_data.get("default_eleven_voice", "21m00Tcm4TlvDq8ikWAM"),
        )
        if isinstance(eleven_voice_data, list):
            eleven_voices = eleven_voice_data
        else:
            eleven_voices = [eleven_voice_data]
        return edge_voice, eleven_voices

    async def synthesize(self, text: str, output_path: str = "/tmp/judgy_reachy_tts.mp3") -> str:
        """Convert text to speech, return path to the audio file.

        The returned path is *output_path* for Edge and ElevenLabs, and the
        same path with a ``.wav`` extension when Groq Orpheus produced the audio.
        """
        edge_voice, eleven_voices = self._get_voice_for_personality()
        provider = self.tts_provider
        # Edge and ElevenLabs would speak "[angry]" verbatim, so they always
        # get the tag-free text; only Orpheus receives the original.
        text_no_tags = strip_orpheus_tags(text)

        # Explicit Edge: skip everything else
        if provider == "edge":
            logger.info(f"TTS provider=edge: using Edge TTS voice={edge_voice}")
            return await self._synthesize_edge(text_no_tags, output_path, edge_voice)

        # Explicit Groq: try Groq Orpheus, fall back to Edge on failure
        if provider == "groq":
            if self.groq_key:
                try:
                    logger.info(f"TTS provider=groq: Orpheus voice={self.groq_tts_voice}")
                    return await self._synthesize_groq(text, self._wav_path(output_path))
                except Exception as e:
                    logger.warning(f"Groq TTS failed: {e}, falling back to Edge TTS")
            else:
                logger.warning("TTS provider=groq but no groq_key; falling back to Edge TTS")
            return await self._synthesize_edge(text_no_tags, output_path, edge_voice)

        # "auto" and explicit "elevenlabs": try ElevenLabs first if available
        # and under the character budget.
        if self.eleven_client and (self.chars_used + len(text_no_tags)) < self.MONTHLY_LIMIT:
            result = await self._try_elevenlabs(text_no_tags, output_path, eleven_voices)
            if result is not None:
                return result

        # Only the auto path may use Groq as the second choice; explicit
        # "elevenlabs" is documented to fall back to Edge directly.
        if provider == "auto" and self.groq_key:
            try:
                logger.info(f"Auto-path -> Groq Orpheus voice={self.groq_tts_voice}")
                # Tags pass through to Orpheus
                return await self._synthesize_groq(text, self._wav_path(output_path))
            except Exception as e:
                logger.warning(f"Groq TTS failed in auto-path: {e}, falling back to Edge TTS")

        # Final fallback: Edge TTS
        logger.info(f"Using Edge TTS with voice: {edge_voice}")
        return await self._synthesize_edge(text_no_tags, output_path, edge_voice)

    async def _try_elevenlabs(self, text: str, output_path: str, voice_ids):
        """Try the cached ElevenLabs voice, then each candidate; return the path or ``None``.

        A voice that succeeds is cached per personality; a cached voice that
        fails is evicted before the candidates are tried.
        """
        if self.personality in self.working_voice_cache:
            cached_voice = self.working_voice_cache[self.personality]
            try:
                logger.info(f"Using cached ElevenLabs voice: {cached_voice}")
                return await self._synthesize_elevenlabs(text, output_path, cached_voice)
            except Exception as e:
                logger.warning(f"Cached voice failed: {e}, trying other voices")
                self.working_voice_cache.pop(self.personality, None)

        for voice_id in voice_ids:
            try:
                logger.info(f"Trying ElevenLabs voice: {voice_id}")
                result = await self._synthesize_elevenlabs(text, output_path, voice_id)
            except Exception as e:
                logger.warning(f"Voice {voice_id} failed: {e}, trying next...")
                continue
            self.working_voice_cache[self.personality] = voice_id
            logger.info(f"✓ Voice {voice_id} works! Cached for {self.personality}")
            return result

        logger.warning(f"All ElevenLabs voices failed for {self.personality}, falling back to Groq/Edge")
        return None

    async def _synthesize_elevenlabs(self, text: str, output_path: str, voice_id: str) -> str:
        """Use ElevenLabs for high-quality voice; writes the audio to *output_path*.

        The SDK call and the file write are blocking, so they run in a worker
        thread (as the Groq path does) instead of stalling the asyncio loop.
        """
        import asyncio

        def _call():
            audio = self.eleven_client.text_to_speech.convert(
                text=text,
                voice_id=voice_id,
                model_id="eleven_multilingual_v2",  # Good balance of emotion and speed
            )
            with open(output_path, "wb") as f:
                for chunk in audio:
                    f.write(chunk)

        await asyncio.to_thread(_call)
        # Count characters only after the whole request succeeded.
        self.chars_used += len(text)
        logger.debug(f"ElevenLabs TTS: {len(text)} chars, total: {self.chars_used}")
        return output_path

    async def _synthesize_edge(self, text: str, output_path: str, voice: str) -> str:
        """Use Edge TTS (free, unlimited); writes the audio to *output_path*."""
        import edge_tts

        communicate = edge_tts.Communicate(text, voice)
        await communicate.save(output_path)
        logger.debug(f"Edge TTS: {len(text)} chars with voice {voice}")
        return output_path

    async def _synthesize_groq(self, text: str, output_path: str) -> str:
        """Use Groq Orpheus TTS. Returns path to a WAV file.

        Runs the blocking HTTP call in a thread so we don't stall the asyncio loop.
        Raises whatever ``requests`` raises on a transport error or non-2xx status.
        """
        import asyncio
        import requests

        url = "https://api.groq.com/openai/v1/audio/speech"
        payload = {
            "model": self.groq_tts_model,
            "input": text,
            "voice": self.groq_tts_voice,
            "response_format": "wav",
        }
        headers = {
            "Authorization": f"Bearer {self.groq_key}",
            "Content-Type": "application/json",
        }

        def _call():
            r = requests.post(url, json=payload, headers=headers, timeout=20)
            r.raise_for_status()
            with open(output_path, "wb") as f:
                f.write(r.content)
            return len(r.content)

        n = await asyncio.to_thread(_call)
        logger.info(f"Groq Orpheus TTS: {len(text)} chars -> {n} bytes WAV ({self.groq_tts_voice})")
        return output_path