Commit ·
82a7380
1
Parent(s): 3b465fc
fix((extra..)
Browse files- app/config.py +8 -6
- app/core/llm_client.py +42 -14
- app/core/model_registry.py +2 -1
app/config.py
CHANGED
|
@@ -47,13 +47,15 @@ class Settings(BaseSettings):
|
|
| 47 |
GROQ_MODEL: str = "llama-3.3-70b-versatile"
|
| 48 |
|
| 49 |
# Per-task model routing (Production Grade)
|
|
|
|
|
|
|
| 50 |
# Per-task model routing (Production Grade)
|
| 51 |
-
GROQ_FAST_MODEL: str = "
|
| 52 |
-
GROQ_SMART_MODEL: str = "
|
| 53 |
-
GROQ_NATURAL_MODEL: str = "
|
| 54 |
-
GROQ_STRUCTURED_MODEL: str = "
|
| 55 |
-
GROQ_SAFETY_MODEL: str = "llama-guard-
|
| 56 |
-
GROQ_SAFEGUARD_MODEL: str = "
|
| 57 |
|
| 58 |
OPENROUTER_MODEL: str = "meta-llama/llama-3.1-70b-instruct"
|
| 59 |
|
|
|
|
| 47 |
GROQ_MODEL: str = "llama-3.3-70b-versatile"
|
| 48 |
|
| 49 |
# Per-task model routing (Production Grade)
|
| 50 |
+
MAX_RETRIES: int = 2
|
| 51 |
+
|
| 52 |
# Per-task model routing (Production Grade)
|
| 53 |
+
GROQ_FAST_MODEL: str = "llama-3.1-8b-instant" # Blazing fast (~300 tokens/sec)
|
| 54 |
+
GROQ_SMART_MODEL: str = "llama-3.3-70b-versatile" # Balanced reasoning
|
| 55 |
+
GROQ_NATURAL_MODEL: str = "moonshotai/kimi-k2-instruct-0905"
|
| 56 |
+
GROQ_STRUCTURED_MODEL: str = "llama-3.3-70b-versatile" # Reliable JSON
|
| 57 |
+
GROQ_SAFETY_MODEL: str = "meta-llama/llama-guard-4-12b"
|
| 58 |
+
GROQ_SAFEGUARD_MODEL: str = "openai/gpt-oss-safeguard-20b"
|
| 59 |
|
| 60 |
OPENROUTER_MODEL: str = "meta-llama/llama-3.1-70b-instruct"
|
| 61 |
|
app/core/llm_client.py
CHANGED
|
@@ -467,22 +467,36 @@ class GroqClient(BaseLLMClient):
|
|
| 467 |
async def _log_rate_limit_telemetry(self, headers: Dict[str, str]):
|
| 468 |
"""EXTRACT & TRACK REAL-TIME QUOTAS (Aligned with GroqDocs)."""
|
| 469 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 470 |
rem_req = headers.get("x-ratelimit-remaining-requests")
|
| 471 |
rem_tok = headers.get("x-ratelimit-remaining-tokens")
|
| 472 |
|
| 473 |
-
if rem_req:
|
| 474 |
-
|
| 475 |
-
if rem_tok:
|
| 476 |
-
self.remaining_tokens = int(rem_tok)
|
| 477 |
|
| 478 |
self.reset_requests = headers.get("x-ratelimit-reset-requests", self.reset_requests)
|
| 479 |
self.reset_tokens = headers.get("x-ratelimit-reset-tokens", self.reset_tokens)
|
| 480 |
|
| 481 |
-
#
|
| 482 |
-
if
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 486 |
except (ValueError, TypeError):
|
| 487 |
pass
|
| 488 |
|
|
@@ -777,19 +791,33 @@ class GroqClient(BaseLLMClient):
|
|
| 777 |
"max_tokens": max_tokens
|
| 778 |
}
|
| 779 |
if kwargs.get("stop"): payload["stop"] = kwargs["stop"]
|
|
|
|
| 780 |
if enabled_tools:
|
| 781 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 782 |
|
| 783 |
# JSON Mode Handling
|
| 784 |
if json_mode:
|
| 785 |
if model_registry.supports(current_model, Capability.JSON_OBJECT):
|
| 786 |
payload["response_format"] = {"type": "json_object"}
|
| 787 |
-
|
| 788 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 789 |
else:
|
| 790 |
# Raw fallback logic for JSON
|
| 791 |
-
if "json" not in prompt.lower():
|
| 792 |
-
payload["messages"][
|
|
|
|
| 793 |
|
| 794 |
# Reasoning Optimization
|
| 795 |
if is_reasoning_model:
|
|
|
|
| 467 |
async def _log_rate_limit_telemetry(self, headers: Dict[str, str]):
|
| 468 |
"""EXTRACT & TRACK REAL-TIME QUOTAS (Aligned with GroqDocs)."""
|
| 469 |
try:
|
| 470 |
+
# Limits (Capacity)
|
| 471 |
+
limit_req = headers.get("x-ratelimit-limit-requests")
|
| 472 |
+
limit_tok = headers.get("x-ratelimit-limit-tokens")
|
| 473 |
+
|
| 474 |
+
# Remaining (State)
|
| 475 |
rem_req = headers.get("x-ratelimit-remaining-requests")
|
| 476 |
rem_tok = headers.get("x-ratelimit-remaining-tokens")
|
| 477 |
|
| 478 |
+
if rem_req: self.remaining_requests = int(rem_req)
|
| 479 |
+
if rem_tok: self.remaining_tokens = int(rem_tok)
|
|
|
|
|
|
|
| 480 |
|
| 481 |
self.reset_requests = headers.get("x-ratelimit-reset-requests", self.reset_requests)
|
| 482 |
self.reset_tokens = headers.get("x-ratelimit-reset-tokens", self.reset_tokens)
|
| 483 |
|
| 484 |
+
# Smart Alerting: Calculate utilization if limits are available
|
| 485 |
+
if limit_tok and rem_tok:
|
| 486 |
+
l_tok = float(limit_tok)
|
| 487 |
+
r_tok = float(rem_tok)
|
| 488 |
+
if l_tok > 0 and (r_tok / l_tok) < 0.2: # Less than 20% remaining
|
| 489 |
+
print(f" [ALERT] SOC ALERT: High Token Load ({int(r_tok)}/{int(l_tok)} TPM left). Reset in {self.reset_tokens}", flush=True)
|
| 490 |
+
|
| 491 |
+
elif self.remaining_tokens < 1000: # Fallback absolute floor
|
| 492 |
+
print(f" [ALERT] SOC ALERT: Critical Token Quota ({self.remaining_tokens} left).", flush=True)
|
| 493 |
+
|
| 494 |
+
if limit_req and rem_req:
|
| 495 |
+
l_req = float(limit_req)
|
| 496 |
+
r_req = float(rem_req)
|
| 497 |
+
if l_req > 0 and (r_req / l_req) < 0.1: # Less than 10% daily requests remaining
|
| 498 |
+
print(f" [ALERT] SOC ALERT: Daily Request Limits Critical ({int(r_req)}/{int(l_req)} RPD left).", flush=True)
|
| 499 |
+
|
| 500 |
except (ValueError, TypeError):
|
| 501 |
pass
|
| 502 |
|
|
|
|
| 791 |
"max_tokens": max_tokens
|
| 792 |
}
|
| 793 |
if kwargs.get("stop"): payload["stop"] = kwargs["stop"]
|
| 794 |
+
# 🔒 COMPOUND CUSTOM: Only for Groq Compound models
|
| 795 |
if enabled_tools:
|
| 796 |
+
is_compound = "compound" in current_model.lower()
|
| 797 |
+
if is_compound:
|
| 798 |
+
payload["compound_custom"] = {"tools": {"enabled_tools": enabled_tools}}
|
| 799 |
+
else:
|
| 800 |
+
# For standard models, we ignore enabled_tools as passed here (list of strings)
|
| 801 |
+
# because standard models require full tool definitions.
|
| 802 |
+
pass
|
| 803 |
|
| 804 |
# JSON Mode Handling
|
| 805 |
if json_mode:
|
| 806 |
if model_registry.supports(current_model, Capability.JSON_OBJECT):
|
| 807 |
payload["response_format"] = {"type": "json_object"}
|
| 808 |
+
# Only append hint if not already present
|
| 809 |
+
if "json" not in prompt.lower() and "json" not in payload["messages"][-1]["content"].lower():
|
| 810 |
+
# If the last message is from the user, append the hint to it
|
| 811 |
+
if payload["messages"][-1]["role"] == "user":
|
| 812 |
+
payload["messages"][-1]["content"] += "\n\n(Respond in JSON)"
|
| 813 |
+
else:
|
| 814 |
+
# Last message is not from the user: append the hint as a separate user-role message
|
| 815 |
+
payload["messages"].append({"role": "user", "content": "(Respond in JSON)"})
|
| 816 |
else:
|
| 817 |
# Raw fallback logic for JSON
|
| 818 |
+
if "json" not in prompt.lower() and "json" not in payload["messages"][-1]["content"].lower():
|
| 819 |
+
if payload["messages"][-1]["role"] == "user":
|
| 820 |
+
payload["messages"][-1]["content"] += "\n\nCRITICAL: Respond ONLY with a valid JSON object."
|
| 821 |
|
| 822 |
# Reasoning Optimization
|
| 823 |
if is_reasoning_model:
|
app/core/model_registry.py
CHANGED
|
@@ -50,7 +50,8 @@ class ModelRegistry:
|
|
| 50 |
"llama-3.3-70b-versatile": {
|
| 51 |
"provider": "groq",
|
| 52 |
"capabilities": [
|
| 53 |
-
Capability.TOOLS, Capability.PARALLEL_TOOLS, Capability.REMOTE_MCP
|
|
|
|
| 54 |
],
|
| 55 |
"role": "SMART_REASONING",
|
| 56 |
"description": "Versatile reasoning and official JSON Schema support",
|
|
|
|
| 50 |
"llama-3.3-70b-versatile": {
|
| 51 |
"provider": "groq",
|
| 52 |
"capabilities": [
|
| 53 |
+
Capability.TOOLS, Capability.PARALLEL_TOOLS, Capability.REMOTE_MCP,
|
| 54 |
+
Capability.JSON_OBJECT, Capability.JSON_SCHEMA
|
| 55 |
],
|
| 56 |
"role": "SMART_REASONING",
|
| 57 |
"description": "Versatile reasoning and official JSON Schema support",
|