""" Inference Script - Inventory Optimization Environment ===================================================== Required env vars: API_BASE_URL The API endpoint for the LLM. MODEL_NAME The model identifier to use for inference. HF_TOKEN Hugging Face token (preferred for HF Router). Supported key env vars (first non-empty wins): HF_TOKEN, API_KEY, OPENAI_API_KEY. For non-OpenAI endpoints, a dummy key is used when no key is provided because the OpenAI Python SDK requires a non-empty api_key argument. """ import os import json import textwrap from dotenv import load_dotenv load_dotenv() from openai import OpenAI from server.inventory_env import InventoryEnvironment from server.constants import EXTRA_INVENTORY_COST, EVENT_DURATION, TASKS, COST_PRICES, SHIPPING_COST, BASE_PRICES from models import InventoryAction API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1" API_KEY = os.getenv("API_KEY") or os.getenv("HF_TOKEN") or os.getenv("OPENAI_API_KEY") MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen3-32B" TASK_NAME = os.getenv("TASK_NAME") or "easy" MAX_DAYS = 30 SYSTEM_PROMPT = textwrap.dedent(""" You are an inventory management AI agent. Each day you receive the current state of a retail store with 5 products: electronics, clothing, groceries, furniture, toys. You will be shown your decision history from recent days so you can learn from past outcomes. Use this history to spot demand trends, identify what worked vs. what didn't, and adjust your strategy accordingly. Groceries are perishable (5-day shelf life). Other products don't expire. 
Product selling prices: electronics=$150, clothing=$40, groceries=$10, furniture=$200, toys=$25 Product cost prices: electronics=$100, clothing=$25, groceries=$5, furniture=$130, toys=$12 Profit margins: electronics=$50, clothing=$15, groceries=$5, furniture=$70, toys=$13 Shipping costs per unit: slow=$2 (3-7 days), medium=$5 (2-4 days), fast=$10 (1 day, always reliable) Warehouse capacity: electronics=100, clothing=200, groceries=500, furniture=50, toys=300 Events (like black_friday, christmas) boost demand when their countdown hits 0 and last for 2 days. Weekends (day%7 == 5 or 6) have 1.2x demand. CRITICAL STRATEGY: - Review your history: if reward was negative, identify why and change approach. - Track demand trends across days. - You MUST restock products when inventory is low. Missed sales = lost revenue = negative reward. - Do NOT overbuy when demand is low — unsold stock ties up cash and perishables expire. - Stock up BEFORE events hit (check event countdowns — order 3-5 days ahead). - When no events are approaching, slow shipping is often sufficient and saves significant cost. - Near end of episode (last 2 days), stop buying — focus on selling remaining stock. DYNAMIC PRICING: You can set a price multiplier (0.5 to 1.5) per product each day. Default is 1.0. - Lower price (e.g. 0.7) = more demand but less revenue per unit. Good for clearing excess stock. - Higher price (e.g. 1.3) = less demand but more revenue per unit. Good when stock is low. - Price elasticity varies across different products. - Elasticity values: electronics=1.2, clothing=1.5, groceries=0.4, furniture=0.8, toys=1.3 Each day you must respond with a JSON action: { "buy_quantities": {"product_name": quantity, ...}, "delivery_method": "slow" | "medium" | "fast", "liquidate": {"product_name": quantity, ...}, "price_multipliers": {"product_name": multiplier, ...} } - buy_quantities: products and amounts to order. 
- delivery_method: shipping speed for this order - liquidate: products and amounts to dispose of (no revenue, empty {} to skip) Use liquidate to free up warehouse space before a restock. - price_multipliers: set selling price multiplier per product (0.5-1.5, default 1.0 if omitted) LEARNING FROM HISTORY: - Compare your past buy quantities to the demand that followed — were you over or under? - If you see repeated stockouts for a product, increase orders for it. - If groceries expired, you overbought — reduce grocery orders or use faster shipping. - A negative reward means your last action was bad — adjust immediately. Before responding with JSON, briefly reason (2-3 lines max): 1. What did I learn from recent history? What went wrong/right? 2. What products need restocking vs. are overstocked? 3. Are any events approaching? Then output ONLY the final JSON action on the last line. """).strip() def format_observation(obs): """Convert observation into a readable prompt for the LLM.""" # format inventory with batch detail, remaining capacity, and extra cost inv_lines = [] for product, batches in obs.updated_inventory.items(): total = sum(b[0] for b in batches) remaining = obs.remaining_capacity.get(product, 0) extra_cost = EXTRA_INVENTORY_COST.get(product, 0) batch_detail = ", ".join( f"{b[0]} units" + (f" ({b[1]}d left)" if b[1] is not None else "") for b in batches ) inv_lines.append(f" {product}: {total} total [{batch_detail}] | space left: {remaining} (extra space: ${extra_cost}/unit)") inv_text = "\n".join(inv_lines) # format events event_lines = [] for event, days in obs.updated_events.items(): if days > 0: event_lines.append(f" {event}: in {days} days") elif -EVENT_DURATION < days <= 0: event_lines.append(f" {event}: ACTIVE NOW") else: event_lines.append(f" {event}: ended") events_text = "\n".join(event_lines) if event_lines else " None" # format deliveries delivery_lines = [] for delivery in obs.updated_deliveries: for product, shipment in delivery.items(): 
qty, arrival_day = shipment days_away = arrival_day - obs.current_day delivery_lines.append(f" {product}: {qty} units arriving in {days_away} days") deliveries_text = "\n".join(delivery_lines) if delivery_lines else " None" # format demand (yesterday's demand — feedback, not prediction) demand_lines = [] for product, units in obs.demand_today.items(): demand_lines.append(f" {product}: {units} units") demand_text = "\n".join(demand_lines) if demand_lines else " No demand data yet" prompt = f"""Day: {obs.current_day}/{MAX_DAYS} Cash: ${obs.total_cash:.2f} Day Profit: ${obs.day_profit:.2f} Total Profit: ${obs.total_profit:.2f} Last Step Reward: {obs.reward:.3f} Inventory: {inv_text} Yesterday's Demand: {demand_text} Upcoming Events: {events_text} Pending Deliveries: {deliveries_text} Respond with your action as JSON.""" return prompt def parse_action(response_text): """Parse LLM response into InventoryAction. Extracts JSON even if surrounded by text.""" try: text = response_text.strip() # strip markdown code fences if "```" in text: parts = text.split("```") for part in parts: part = part.strip() if part.startswith("json"): part = part[4:].strip() if part.startswith("{"): text = part break # find the first { and last } to extract JSON start = text.find("{") end = text.rfind("}") if start != -1 and end != -1 and end > start: text = text[start:end + 1] data = json.loads(text) # only keep valid fields clean = {} if "buy_quantities" in data: clean["buy_quantities"] = data["buy_quantities"] if "delivery_method" in data: clean["delivery_method"] = data["delivery_method"] if "liquidate" in data: clean["liquidate"] = data["liquidate"] if "price_multipliers" in data: clean["price_multipliers"] = data["price_multipliers"] return InventoryAction(**clean) except Exception as e: print(f" [DEBUG] Parse FAILED: {e}") print(f" [DEBUG] Raw LLM response: {response_text[:500]}") return InventoryAction( buy_quantities={}, delivery_method="slow", liquidate={}, price_multipliers={}, ) 
HISTORY_WINDOW = 7  # rolling window of past days to include in context


def run_task(client, task_name):
    """Run a single task and return total profit.

    Plays one episode of ``InventoryEnvironment(task_name)``: each day the
    current observation plus the last ``HISTORY_WINDOW`` turns are sent to the
    chat-completions API, the reply is parsed into an action and applied with
    ``env.step``. Prints one ``[START]`` line, one ``[STEP]`` line per day and
    one ``[END]`` line. The ``[END]`` line is printed even if the episode
    raises, in which case the exception still propagates.
    """
    env = InventoryEnvironment(task_name)
    obs = env.reset()
    rewards = []
    steps_taken = 0
    success = False
    # Set before the try block so the [END] line in `finally` can always be
    # formatted. Otherwise a failure before grading raises NameError there and
    # hides the original exception.
    score = 0.0
    print(f"[START] task={task_name} env=inventory_env model={MODEL_NAME}", flush=True)

    # Rolling history of (user_observation, assistant_response) pairs
    history = []
    try:
        for day in range(1, env.max_days + 1):
            if obs.done:
                break
            user_prompt = format_observation(obs)

            # Build messages: system + history context + current observation
            messages = [{"role": "system", "content": SYSTEM_PROMPT}]
            recent = history[-HISTORY_WINDOW:]
            if recent:
                messages.append({
                    "role": "user",
                    "content": f"Here is your decision history from the last {len(recent)} day(s). "
                               "Use this to identify demand trends, adjust restocking, and avoid repeating mistakes.",
                })
                messages.append({
                    "role": "assistant",
                    "content": "Understood. I'll review my past decisions and their outcomes to make better choices today.",
                })
                for past_user, past_assistant in recent:
                    messages.append({"role": "user", "content": past_user})
                    messages.append({"role": "assistant", "content": past_assistant})
            messages.append({"role": "user", "content": user_prompt})

            error = None
            try:
                completion = client.chat.completions.create(
                    model=MODEL_NAME,
                    messages=messages,
                    temperature=0.0,
                    max_completion_tokens=500,
                    stream=False,
                )
                response_text = completion.choices[0].message.content or ""
            except Exception as exc:
                # An API failure becomes an empty action; the error text is
                # reported on this day's [STEP] line.
                error = str(exc)
                response_text = "{}"

            # Save this turn to rolling history
            history.append((user_prompt, response_text))

            action = parse_action(response_text)
            action_str = json.dumps({
                "buy": action.buy_quantities,
                "deliver": action.delivery_method,
                "liquidate": action.liquidate,
                "prices": action.price_multipliers,
            })
            obs = env.step(action)
            reward = obs.reward
            done = obs.done
            rewards.append(reward)
            steps_taken = day
            print(
                f"[STEP] step={day} action={action_str} reward={reward:.2f} "
                f"done={str(done).lower()} error={error if error else 'null'}",
                flush=True,
            )
            if done:
                break

        # compute score
        from server.grader import grade
        score = grade(task_name, obs.total_profit)
        success = score >= 0.1
    finally:
        rewards_str = ",".join(f"{r:.2f}" for r in rewards)
        print(
            f"[END] success={str(success).lower()} steps={steps_taken} score={score:.3f} rewards={rewards_str}",
            flush=True,
        )
    return obs.total_profit


def main():
    """Print baselines, run the easy/medium/hard tasks, then print final scores."""
    from server.grader import grade, compute_baselines

    if not MODEL_NAME:
        raise RuntimeError("MODEL_NAME is not set. Please export MODEL_NAME before running inference.")

    client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)
    tasks = ["easy", "medium", "hard"]

    # print baselines (computed once per task and reused for the final table)
    print(f"\n{'=' * 50}")
    print("BASELINES")
    print(f"{'=' * 50}")
    baselines = {}
    for task_name in tasks:
        floor, ceiling = baselines[task_name] = compute_baselines(task_name)
        print(f" {task_name}: floor=${floor:.2f} (passive) | ceiling=${ceiling:.2f} (heuristic)")

    results = {}
    for task_name in tasks:
        results[task_name] = run_task(client, task_name)

    print(f"\n{'=' * 50}")
    print("FINAL SCORES")
    print(f"{'=' * 50}")
    for task_name in tasks:
        floor, ceiling = baselines[task_name]
        score = grade(task_name, results[task_name])
        print(f" {task_name}: {score:.3f} (profit: ${results[task_name]:.2f} | floor: ${floor:.2f} | ceiling: ${ceiling:.2f})")


if __name__ == "__main__":
    main()