Is there any special design on agentical framwork? memory/planning? I only got 23% score on OSWorld

#5
by Wenjin0421 - opened

Hi H Company,

Thanks for sharing the model! I self-deployed it via vLLM. It works well for estimating the coordinates of UI elements with structured output. I implemented a simple agent flow with the OSWorld framework but only got a 23% success rate. I have attached holo3_agent.py, with the predict implementation, for your reference. Do you have any suggestions for reproducing the OSWorld experiment? Thanks!

import base64
import json
import logging
import re
import time
import os
from io import BytesIO
from typing import Dict, List, Tuple

import openai
from PIL import Image
from pydantic import BaseModel, Field

# Module-level logger used by Holo3Agent. ``Holo3Agent.reset`` may replace it
# with a caller-supplied logger; starting from a real logger instead of None
# keeps ``predict``/``call_llm`` usable even when ``reset`` was never called.
logger = logging.getLogger("desktopenv.holo3_agent")

# Maximum number of attempts ``Holo3Agent.call_llm`` makes per step.
MAX_RETRY_TIMES = 5


def encode_image(image_content):
    """Return the base64 text of raw image bytes, sent as-is (no resizing)."""
    encoded_bytes = base64.b64encode(image_content)
    return str(encoded_bytes, "utf-8")


# --- Structured output schemas for guaranteed JSON parsing ---

class ComputerUseAction(BaseModel):
    """Schema for structured_outputs: the model returns exactly this JSON.

    ``Holo3Agent.call_llm`` sends ``ComputerUseAction.model_json_schema()`` to
    the server as the JSON schema for the completion. Only ``action`` has no
    default; every other field defaults to ``None`` and is meant to be filled
    in only for the action named in its description.
    """
    # NOTE(review): the fields below default to None but are annotated with
    # non-Optional types (List[int], str, ...). Presumably intentional so the
    # generated schema does not advertise ``null`` -- confirm before changing,
    # because the annotations determine the schema sent to the server.
    # NOTE(review): there is no free-text "thought"/"reasoning" field, while the
    # system prompt in Holo3Agent.predict asks the model to reflect before
    # acting. Whether the server still permits reasoning output alongside the
    # constrained JSON is not visible from this file -- TODO confirm.

    # Name of the action to perform (the valid names are listed in the description).
    action: str = Field(description="The action to perform: left_click, right_click, middle_click, double_click, type, key, mouse_move, left_click_drag, scroll, wait, terminate")
    # Target position; Holo3Agent scales it from 0-1000 space to screen pixels.
    coordinate: List[int] = Field(default=None, description="[x, y] coordinates in 0-1000 normalized space")
    # Key names; one key is pressed, several are sent as a hotkey combination.
    keys: List[str] = Field(default=None, description="Keys to press (for action=key)")
    # Text to be typed by the ``type`` action.
    text: str = Field(default=None, description="Text to type (for action=type)")
    # Scroll amount passed straight to ``pyautogui.scroll``.
    pixels: int = Field(default=None, description="Scroll amount (for action=scroll)")
    # Requested wait duration. Holo3Agent.action_to_pyautogui emits a plain
    # "WAIT" and does not read this value.
    time: float = Field(default=None, description="Seconds to wait (for action=wait)")
    # Outcome reported together with ``terminate``.
    status: str = Field(default=None, description="Task status: success or failure (for action=terminate)")


class Holo3Agent:

    def __init__(
        self,
        platform: str = "ubuntu",
        model: str = "holo3-35B-A3B",
        max_tokens: int = 1024,
        top_p: float = 0.9,
        temperature: float = 0.0,
        action_space: str = "pyautogui",
        observation_type: str = "screenshot",
        history_n: int = 1,
        add_thought_prefix: bool = False,
        coordinate_type: str = "relative",
        base_url: str = "http://localhost:8000/v1",
        api_key: str = "EMPTY",
        provider_name: str = "aws",
        screen_size: Tuple[int, int] = (1920, 1080),
        **kwargs,
    ):
        self.platform = platform
        self.model = model
        self.max_tokens = max_tokens
        self.top_p = top_p
        self.temperature = temperature
        self.action_space = action_space
        self.observation_type = observation_type
        self.history_n = history_n
        self.add_thought_prefix = add_thought_prefix
        self.coordinate_type = coordinate_type
        self.base_url = base_url
        self.api_key = api_key
        self.provider_name = provider_name
        self.screen_size = screen_size

        assert action_space in ["pyautogui"], "Invalid action space"
        assert observation_type in ["screenshot"], "Invalid observation type"

        self.thoughts = []
        self.actions = []
        self.observations = []
        self.responses = []
        self.screenshots = []
        self.last_error = None  # Error feedback for next step

    def predict(self, instruction: str, obs: Dict) -> Tuple[str, List[str]]:
        """
        Predict the next action from the current observation.

        Builds the chat prompt (system prompt, up to ``history_n`` replayed
        screenshot/response turns, then the current screenshot), queries the
        model through ``call_llm`` and converts the parsed action with
        ``action_to_pyautogui``.

        Args:
            instruction: Natural-language description of the task.
            obs: Observation dict; only ``obs["screenshot"]`` (encoded image
                bytes) is read. The image is forwarded with an ``image/png``
                data URL, so it is presumably PNG -- verify against the caller.

        Returns:
            ``(response_text, pyautogui_code)``. ``pyautogui_code`` is a list
            of command strings and is empty when the model output could not be
            turned into an action; the reason is then kept in
            ``self.last_error`` and fed back to the model on the next call.
        """
        screenshot_bytes = obs["screenshot"]

        # Only the image size is needed, to scale normalized coordinates.
        width, height = Image.open(BytesIO(screenshot_bytes)).size
        logger.info(f"Screen resolution: {width}x{height}")

        # No resize: the raw screenshot is sent to the model.
        screenshot_b64 = encode_image(screenshot_bytes)
        self.screenshots.append(screenshot_b64)

        current_step = len(self.actions)
        history_start_idx = max(0, current_step - self.history_n)

        # Steps older than the replayed window are summarised as text; the
        # most recent ``history_n`` steps are replayed as real chat turns below.
        older_actions = self.actions[:history_start_idx]
        previous_actions_str = "\n".join(
            f"Step {step_no}: {described}"
            for step_no, described in enumerate(older_actions, start=1)
        ) or "None"

        system_prompt = (
            "You are a computer use agent that follows instructions and performs desktop computer tasks.\n"
            "You have good knowledge of computers and the internet.\n"
            "For each step, you will get a screenshot of the computer screen and you will predict the next action.\n\n"
            "ENVIRONMENT:\n"
            "* This is an Ubuntu desktop GUI with internet access.\n"
            "* You must click on desktop icons to start applications.\n"
            "* Some applications may take time to start or process actions, so you may need to wait and observe.\n"
            "* When viewing a page, make sure you scroll down to see everything before deciding something isn't available.\n"
            "* The screen uses a 1000x1000 coordinate system. All coordinates must be in this 0-1000 range.\n"
            "* Click buttons and icons in their center, not on edges.\n"
            "* If a click didn't work, try adjusting the coordinates slightly.\n"
            "* The computer's password is 'osworld-public-evaluation' if you need sudo rights.\n\n"
            "ACTIONS (output as JSON with 'action' field and relevant parameters):\n"
            "- left_click: click at coordinate [x, y]\n"
            "- right_click: right-click at coordinate [x, y]\n"
            "- double_click: double-click at coordinate [x, y]\n"
            "- middle_click: middle-click at coordinate [x, y]\n"
            "- mouse_move: move cursor to coordinate [x, y]\n"
            "- left_click_drag: drag to coordinate [x, y]\n"
            "- type: type text string (set 'text' field)\n"
            "- key: press key combination (set 'keys' field, e.g. ['ctrl', 'c'])\n"
            "- scroll: scroll by amount (set 'pixels' field, positive=up, negative=down)\n"
            "- wait: wait for changes (set 'time' field in seconds)\n"
            "- terminate: end the task (set 'status' to 'success' or 'failure')\n\n"
            "RULES:\n"
            "* First, briefly reflect on the current screenshot and what has been done so far.\n"
            "* Then decide the single best next action.\n"
            "* Be careful to ensure coordinates are correct by examining the screenshot closely.\n"
            "* Do NOT terminate until you have visually confirmed the task is fully complete in the screenshot.\n"
            "* Do NOT easily declare failure. Try your best to complete the task.\n"
            "* Only terminate with status='success' when you can see clear evidence the task goal has been achieved.\n"
        )

        task_prompt = (
            f"Task: {instruction}\n\n"
            f"Previous actions:\n{previous_actions_str}\n\n"
        )
        closing_prompt = (
            "Look at the current screenshot carefully. Reflect on what you see and what has been done so far, "
            "then predict the next action to complete the task."
        )

        error_feedback = ""
        if self.last_error:
            error_feedback = (
                f"ERROR from previous step: {self.last_error}\n"
                "Please correct your action based on this error."
            )
            self.last_error = None  # Clear after injecting

        current_image_part = {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"},
        }

        # Build messages
        messages = [
            {"role": "system", "content": [{"type": "text", "text": system_prompt}]}
        ]

        history_len = min(self.history_n, len(self.responses))
        if history_len > 0:
            history_responses = self.responses[-history_len:]
            history_screenshots = self.screenshots[-history_len - 1 : -1]

            for idx, past_response in enumerate(history_responses):
                if idx < len(history_screenshots):
                    user_parts = [{
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{history_screenshots[idx]}"},
                    }]
                    if idx == 0:
                        # The task text opens the replayed conversation.
                        user_parts.append(
                            {"type": "text", "text": task_prompt + closing_prompt}
                        )
                    messages.append({"role": "user", "content": user_parts})

                messages.append({
                    "role": "assistant",
                    "content": [{"type": "text", "text": past_response}],
                })

            # The error refers to the last assistant turn, so it belongs on
            # the turn that follows it, not on the opening turn before it.
            final_parts = [current_image_part]
            if error_feedback:
                final_parts.append({"type": "text", "text": error_feedback})
            messages.append({"role": "user", "content": final_parts})
        else:
            instruction_prompt = task_prompt
            if error_feedback:
                instruction_prompt += error_feedback + "\n\n"
            instruction_prompt += closing_prompt
            messages.append({
                "role": "user",
                "content": [
                    current_image_part,
                    {"type": "text", "text": instruction_prompt},
                ],
            })

        # --- Log input prompt (text parts only, no base64 images) ---
        input_prompt_text = self._render_prompt_for_log(messages)
        logger.info(f"[Step {current_step + 1}] === INPUT PROMPT ===\n{input_prompt_text}")

        response_text, action_data = self.call_llm(messages)

        # --- Log output response ---
        logger.info(f"[Step {current_step + 1}] === MODEL OUTPUT ===\n{response_text}")
        logger.info(f"[Step {current_step + 1}] Parsed action: {json.dumps(action_data) if action_data else 'None'}")

        self.responses.append(response_text)

        pyautogui_code = self.action_to_pyautogui(action_data, width, height)

        if self.last_error:
            logger.warning(f"[Step {current_step + 1}] ACTION ERROR: {self.last_error}")

        low_level_instruction = self._describe_action(action_data, width, height)

        logger.info(f"[Step {current_step + 1}] Pyautogui: {pyautogui_code}")

        self.actions.append(low_level_instruction)

        return response_text, pyautogui_code

    @staticmethod
    def _render_prompt_for_log(messages):
        """Flatten chat messages to text for logging; images become ``<image>``."""
        lines = []
        for msg in messages:
            role = msg.get("role", "")
            content = msg.get("content", [])
            if isinstance(content, str):
                lines.append(f"[{role}] {content}")
                continue
            if not isinstance(content, list):
                continue
            for part in content:
                if not isinstance(part, dict):
                    continue
                part_type = part.get("type")
                if part_type == "text":
                    lines.append(f"[{role}] {part['text']}")
                elif part_type == "image_url":
                    lines.append(f"[{role}] <image>")
        return "\n".join(lines)

    def _describe_action(self, action_data, width, height):
        """Return the one-line description of an action kept in ``self.actions``."""
        if not action_data:
            return ""
        action_name = action_data.get("action", "unknown")
        coord = action_data.get("coordinate")
        # Only scale a coordinate that really is an [x, y] pair of numbers; a
        # short or malformed one is reported verbatim instead of crashing.
        has_xy = (
            isinstance(coord, (list, tuple))
            and len(coord) >= 2
            and all(isinstance(value, (int, float)) for value in coord[:2])
        )
        if has_xy:
            adj_x, adj_y = self._normalize_coord(coord[0], coord[1], width, height)
            return f"{action_name} at normalized ({coord[0]},{coord[1]}) -> pixel ({adj_x},{adj_y})"
        return f"{action_name}: {json.dumps({k:v for k,v in action_data.items() if k != 'action' and v is not None})}"

    def call_llm(self, messages):
        """
        Call Holo3 through the OpenAI-compatible endpoint and parse the action.

        The request carries the ``ComputerUseAction`` JSON schema in
        ``extra_body["structured_outputs"]``. The endpoint and key come from
        the ``HOLO3_BASE_URL`` / ``HOLO3_API_KEY`` environment variables when
        set, otherwise from the values given to the constructor.

        Up to ``MAX_RETRY_TIMES`` attempts are made. A failed request is
        retried after a growing pause (3 s x attempt number); an empty
        completion is retried immediately.

        Args:
            messages: Chat messages in OpenAI chat-completions format.

        Returns:
            ``(response_text, action_dict)``. ``action_dict`` holds the parsed
            action with ``None`` values removed, or is ``None`` when the text
            could not be parsed. ``("", None)`` is returned when no attempt
            produced any content.
        """
        base_url = os.environ.get("HOLO3_BASE_URL", self.base_url)
        api_key = os.environ.get("HOLO3_API_KEY", self.api_key)
        client = openai.OpenAI(base_url=base_url, api_key=api_key)

        for attempt in range(1, MAX_RETRY_TIMES + 1):
            logger.info(
                f"[Holo3] Generating with model: {self.model} "
                f"(attempt {attempt}/{MAX_RETRY_TIMES})"
            )
            try:
                call_start = time.time()
                response = client.chat.completions.create(
                    model=self.model,
                    messages=messages,
                    max_tokens=self.max_tokens,
                    temperature=self.temperature,
                    top_p=self.top_p,
                    extra_body={
                        "structured_outputs": {
                            "json": ComputerUseAction.model_json_schema()
                        }
                    },
                )
                call_duration = time.time() - call_start
                content = response.choices[0].message.content

                # Log token usage and latency
                usage = getattr(response, 'usage', None)
                if usage:
                    logger.info(f"[Holo3] Tokens: input={usage.prompt_tokens}, output={usage.completion_tokens}, total={usage.total_tokens}")
                logger.info(f"[Holo3] Latency: {call_duration:.1f}s")

                if not content:
                    logger.warning("[Holo3] Empty completion received; retrying")
                    continue

                # Parse the JSON produced under the structured-output schema.
                try:
                    parsed = json.loads(content)
                except json.JSONDecodeError:
                    parsed = None

                if isinstance(parsed, dict):
                    # Clean nulls
                    action_data = {k: v for k, v in parsed.items() if v is not None}
                    return content, action_data

                # Invalid JSON, or valid JSON that is not an object (list,
                # string, number): this is a parsing problem, not a transport
                # failure, so use the text fallback instead of re-querying.
                logger.warning(f"[Holo3] Failed to parse structured output: {content}")
                return content, self._fallback_parse(content)

            except Exception as e:
                # Boundary with the remote service: log and retry with backoff.
                logger.error(f"[Holo3] Error calling model: {e}")
                if attempt < MAX_RETRY_TIMES:
                    time.sleep(3 * attempt)
                    continue
                break
        return "", None

    def _fallback_parse(self, content):
        """Fallback parser if structured_outputs fails."""
        # Strip thinking
        text = content
        if "</think>" in text:
            text = text[text.index("</think>") + len("</think>"):].strip()

        # Try to find JSON object
        match = re.search(r'\{[^{}]+\}', text)
        if match:
            try:
                data = json.loads(match.group())
                return {k: v for k, v in data.items() if v is not None}
            except json.JSONDecodeError:
                pass
        return None

    def _normalize_coord(self, x, y, screen_width, screen_height):
        """Convert 0-1000 normalized coordinates to actual screen pixels."""
        return int(x / 1000 * screen_width), int(y / 1000 * screen_height)

    def action_to_pyautogui(self, action_data, screen_width, screen_height):
        """Convert structured action dict to pyautogui code list. Sets self.last_error on failure."""
        if not action_data:
            self.last_error = "Failed to parse model output. No valid action JSON was returned. Please output a valid action."
            return []
        if "action" not in action_data:
            self.last_error = f"Missing 'action' field in response: {json.dumps(action_data)}. You must include an 'action' field."
            return []

        action = action_data["action"]
        # Normalize action aliases
        action_aliases = {
            "click": "left_click",
            "right-click": "right_click",
            "double-click": "double_click",
            "middle-click": "middle_click",
        }
        action = action_aliases.get(action, action)

        valid_actions = {
            "left_click", "right_click", "middle_click", "double_click",
            "type", "key", "scroll", "wait", "terminate",
            "mouse_move", "left_click_drag",
        }
        if action not in valid_actions:
            self.last_error = (
                f"Unknown action '{action}'. Valid actions are: {', '.join(sorted(valid_actions))}. "
                f"Use 'left_click' instead of 'click'."
            )
            return []

        coord = action_data.get("coordinate")
        pyautogui_code = []

        if coord and len(coord) >= 2:
            x, y = coord[0], coord[1]
            if not (0 <= x <= 1000 and 0 <= y <= 1000):
                self.last_error = (
                    f"Coordinate [{x}, {y}] is out of range. "
                    f"Coordinates must be in 0-1000 normalized space."
                )
                return []
            adj_x, adj_y = self._normalize_coord(x, y, screen_width, screen_height)
        else:
            adj_x, adj_y = None, None

        click_actions = {"left_click", "right_click", "middle_click", "double_click", "mouse_move", "left_click_drag"}
        if action in click_actions and adj_x is None:
            self.last_error = (
                f"Action '{action}' requires a 'coordinate' field with [x, y] values in 0-1000 range."
            )
            return []

        if action == "type" and not action_data.get("text"):
            self.last_error = "Action 'type' requires a 'text' field. Please provide the text to type."
            return []

        if action == "key" and not action_data.get("keys"):
            self.last_error = "Action 'key' requires a 'keys' field (list of key names). Example: ['ctrl', 'c']"
            return []

        if action == "left_click":
            pyautogui_code.append(f"pyautogui.click({adj_x}, {adj_y})")
        elif action == "right_click":
            pyautogui_code.append(f"pyautogui.rightClick({adj_x}, {adj_y})")
        elif action == "middle_click":
            pyautogui_code.append(f"pyautogui.middleClick({adj_x}, {adj_y})")
        elif action == "double_click":
            pyautogui_code.append(f"pyautogui.doubleClick({adj_x}, {adj_y})")
        elif action == "type":
            text = action_data["text"]
            pyautogui_code.append(f"pyautogui.typewrite('{text}')")
        elif action == "key":
            keys = [k.strip() for k in action_data["keys"] if isinstance(k, str)]
            keys_str = ", ".join([f"'{k}'" for k in keys])
            if len(keys) > 1:
                pyautogui_code.append(f"pyautogui.hotkey({keys_str})")
            else:
                pyautogui_code.append(f"pyautogui.press({keys_str})")
        elif action == "scroll":
            pixels = action_data.get("pixels", 0)
            pyautogui_code.append(f"pyautogui.scroll({pixels})")
        elif action == "wait":
            pyautogui_code.append("WAIT")
        elif action == "terminate":
            pyautogui_code.append("DONE")
        elif action == "mouse_move":
            pyautogui_code.append(f"pyautogui.moveTo({adj_x}, {adj_y})")
        elif action == "left_click_drag":
            pyautogui_code.append(f"pyautogui.dragTo({adj_x}, {adj_y}, duration=0.5)")

        self.last_error = None
        return pyautogui_code

    def reset(self, _logger=None, **kwargs):
        global logger
        logger = (
            _logger
            if _logger is not None
            else logging.getLogger("desktopenv.holo3_agent")
        )

        self.thoughts = []
        self.actions = []
        self.observations = []
        self.responses = []
        self.screenshots = []
        self.last_error = None
Wenjin0421 changed discussion title from Is any special design on agentical framwork? memory/planning? I only got 23% score on OSWorld to Is there any special design on agentical framwork? memory/planning? I only got 23% score on OSWorld

This is a good question

H company org

Hi @Wenjin0421 ,

Thanks for trying out Holo3! We actually just released a new cookbook that can help you design your agentic loop and improve your OSWorld score.
You can check it out here: https://hub.hcompany.ai/quickstart
Let us know how it goes!

Thanks for the detailed example! I will have a try :)

Sign up or log in to comment