avinash-rai committed on
Commit
82a7380
·
1 Parent(s): 3b465fc

fix((extra..)

Browse files
app/config.py CHANGED
@@ -47,13 +47,15 @@ class Settings(BaseSettings):
47
  GROQ_MODEL: str = "llama-3.3-70b-versatile"
48
 
49
  # Per-task model routing (Production Grade)
 
 
50
  # Per-task model routing (Production Grade)
51
- GROQ_FAST_MODEL: str = "llama3-8b-8192" # Blazing fast (~300 tokens/sec)
52
- GROQ_SMART_MODEL: str = "llama3-70b-8192" # Balanced reasoning
53
- GROQ_NATURAL_MODEL: str = "llama3-8b-8192"
54
- GROQ_STRUCTURED_MODEL: str = "llama3-70b-8192" # Reliable JSON
55
- GROQ_SAFETY_MODEL: str = "llama-guard-3-8b"
56
- GROQ_SAFEGUARD_MODEL: str = "llama-guard-3-8b"
57
 
58
  OPENROUTER_MODEL: str = "meta-llama/llama-3.1-70b-instruct"
59
 
 
47
  GROQ_MODEL: str = "llama-3.3-70b-versatile"
48
 
49
  # Per-task model routing (Production Grade)
50
+ MAX_RETRIES: int = 2
51
+
52
  # Per-task model routing (Production Grade)
53
+ GROQ_FAST_MODEL: str = "llama-3.1-8b-instant" # Blazing fast (~300 tokens/sec)
54
+ GROQ_SMART_MODEL: str = "llama-3.3-70b-versatile" # Balanced reasoning
55
+ GROQ_NATURAL_MODEL: str = "moonshotai/kimi-k2-instruct-0905"
56
+ GROQ_STRUCTURED_MODEL: str = "llama-3.3-70b-versatile" # Reliable JSON
57
+ GROQ_SAFETY_MODEL: str = "meta-llama/llama-guard-4-12b"
58
+ GROQ_SAFEGUARD_MODEL: str = "openai/gpt-oss-safeguard-20b"
59
 
60
  OPENROUTER_MODEL: str = "meta-llama/llama-3.1-70b-instruct"
61
 
app/core/llm_client.py CHANGED
@@ -467,22 +467,36 @@ class GroqClient(BaseLLMClient):
467
  async def _log_rate_limit_telemetry(self, headers: Dict[str, str]):
468
  """EXTRACT & TRACK REAL-TIME QUOTAS (Aligned with GroqDocs)."""
469
  try:
 
 
 
 
 
470
  rem_req = headers.get("x-ratelimit-remaining-requests")
471
  rem_tok = headers.get("x-ratelimit-remaining-tokens")
472
 
473
- if rem_req:
474
- self.remaining_requests = int(rem_req)
475
- if rem_tok:
476
- self.remaining_tokens = int(rem_tok)
477
 
478
  self.reset_requests = headers.get("x-ratelimit-reset-requests", self.reset_requests)
479
  self.reset_tokens = headers.get("x-ratelimit-reset-tokens", self.reset_tokens)
480
 
481
- # Intelligent Warning: If we are below 10% of tokens, print a SOC alert
482
- if self.remaining_tokens < 6000: # One full context window (small)
483
- print(f" [ALERT] SOC ALERT: Low Token Quota ({self.remaining_tokens} tokens left). Reset in {self.reset_tokens}", flush=True)
484
- elif self.remaining_requests < 50:
485
- print(f" [ALERT] SOC ALERT: Low Daily Request Quota ({self.remaining_requests} left). Reset in {self.reset_requests}", flush=True)
 
 
 
 
 
 
 
 
 
 
 
486
  except (ValueError, TypeError):
487
  pass
488
 
@@ -777,19 +791,33 @@ class GroqClient(BaseLLMClient):
777
  "max_tokens": max_tokens
778
  }
779
  if kwargs.get("stop"): payload["stop"] = kwargs["stop"]
 
780
  if enabled_tools:
781
- payload["compound_custom"] = {"tools": {"enabled_tools": enabled_tools}}
 
 
 
 
 
 
782
 
783
  # JSON Mode Handling
784
  if json_mode:
785
  if model_registry.supports(current_model, Capability.JSON_OBJECT):
786
  payload["response_format"] = {"type": "json_object"}
787
- if "json" not in prompt.lower():
788
- payload["messages"][0]["content"] += "\n\n(Respond in JSON)"
 
 
 
 
 
 
789
  else:
790
  # Raw fallback logic for JSON
791
- if "json" not in prompt.lower():
792
- payload["messages"][0]["content"] += "\n\nCRITICAL: Respond ONLY with a valid JSON object."
 
793
 
794
  # Reasoning Optimization
795
  if is_reasoning_model:
 
467
  async def _log_rate_limit_telemetry(self, headers: Dict[str, str]):
468
  """EXTRACT & TRACK REAL-TIME QUOTAS (Aligned with GroqDocs)."""
469
  try:
470
+ # Limits (Capacity)
471
+ limit_req = headers.get("x-ratelimit-limit-requests")
472
+ limit_tok = headers.get("x-ratelimit-limit-tokens")
473
+
474
+ # Remaining (State)
475
  rem_req = headers.get("x-ratelimit-remaining-requests")
476
  rem_tok = headers.get("x-ratelimit-remaining-tokens")
477
 
478
+ if rem_req: self.remaining_requests = int(rem_req)
479
+ if rem_tok: self.remaining_tokens = int(rem_tok)
 
 
480
 
481
  self.reset_requests = headers.get("x-ratelimit-reset-requests", self.reset_requests)
482
  self.reset_tokens = headers.get("x-ratelimit-reset-tokens", self.reset_tokens)
483
 
484
+ # Smart Alerting: Calculate utilization if limits are available
485
+ if limit_tok and rem_tok:
486
+ l_tok = float(limit_tok)
487
+ r_tok = float(rem_tok)
488
+ if l_tok > 0 and (r_tok / l_tok) < 0.2: # Less than 20% remaining
489
+ print(f" [ALERT] SOC ALERT: High Token Load ({int(r_tok)}/{int(l_tok)} TPM left). Reset in {self.reset_tokens}", flush=True)
490
+
491
+ elif self.remaining_tokens < 1000: # Fallback absolute floor
492
+ print(f" [ALERT] SOC ALERT: Critical Token Quota ({self.remaining_tokens} left).", flush=True)
493
+
494
+ if limit_req and rem_req:
495
+ l_req = float(limit_req)
496
+ r_req = float(rem_req)
497
+ if l_req > 0 and (r_req / l_req) < 0.1: # Less than 10% daily requests remaining
498
+ print(f" [ALERT] SOC ALERT: Daily Request Limits Critical ({int(r_req)}/{int(l_req)} RPD left).", flush=True)
499
+
500
  except (ValueError, TypeError):
501
  pass
502
 
 
791
  "max_tokens": max_tokens
792
  }
793
  if kwargs.get("stop"): payload["stop"] = kwargs["stop"]
794
+ # 🔒 COMPOUND CUSTOM: Only for Groq Compound models
795
  if enabled_tools:
796
+ is_compound = "compound" in current_model.lower()
797
+ if is_compound:
798
+ payload["compound_custom"] = {"tools": {"enabled_tools": enabled_tools}}
799
+ else:
800
+ # For standard models, we ignore enabled_tools as passed here (list of strings)
801
+ # because standard models require full tool definitions.
802
+ pass
803
 
804
  # JSON Mode Handling
805
  if json_mode:
806
  if model_registry.supports(current_model, Capability.JSON_OBJECT):
807
  payload["response_format"] = {"type": "json_object"}
808
+ # Only append hint if not already present
809
+ if "json" not in prompt.lower() and "json" not in payload["messages"][-1]["content"].lower():
810
+ # Check if user message is last, append to it
811
+ if payload["messages"][-1]["role"] == "user":
812
+ payload["messages"][-1]["content"] += "\n\n(Respond in JSON)"
813
+ else:
814
+ # Append system instruction if last msg is not user
815
+ payload["messages"].append({"role": "user", "content": "(Respond in JSON)"})
816
  else:
817
  # Raw fallback logic for JSON
818
+ if "json" not in prompt.lower() and "json" not in payload["messages"][-1]["content"].lower():
819
+ if payload["messages"][-1]["role"] == "user":
820
+ payload["messages"][-1]["content"] += "\n\nCRITICAL: Respond ONLY with a valid JSON object."
821
 
822
  # Reasoning Optimization
823
  if is_reasoning_model:
app/core/model_registry.py CHANGED
@@ -50,7 +50,8 @@ class ModelRegistry:
50
  "llama-3.3-70b-versatile": {
51
  "provider": "groq",
52
  "capabilities": [
53
- Capability.TOOLS, Capability.PARALLEL_TOOLS, Capability.REMOTE_MCP
 
54
  ],
55
  "role": "SMART_REASONING",
56
  "description": "Versatile reasoning and official JSON Schema support",
 
50
  "llama-3.3-70b-versatile": {
51
  "provider": "groq",
52
  "capabilities": [
53
+ Capability.TOOLS, Capability.PARALLEL_TOOLS, Capability.REMOTE_MCP,
54
+ Capability.JSON_OBJECT, Capability.JSON_SCHEMA
55
  ],
56
  "role": "SMART_REASONING",
57
  "description": "Versatile reasoning and official JSON Schema support",