Instructions to use Hcompany/Holo3-35B-A3B with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use Hcompany/Holo3-35B-A3B with Transformers:
# Use a pipeline as a high-level helper from transformers import pipeline pipe = pipeline("image-text-to-text", model="Hcompany/Holo3-35B-A3B") messages = [ { "role": "user", "content": [ {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"}, {"type": "text", "text": "What animal is on the candy?"} ] }, ] pipe(text=messages)# Load model directly from transformers import AutoProcessor, AutoModelForMultimodalLM processor = AutoProcessor.from_pretrained("Hcompany/Holo3-35B-A3B") model = AutoModelForMultimodalLM.from_pretrained("Hcompany/Holo3-35B-A3B") messages = [ { "role": "user", "content": [ {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"}, {"type": "text", "text": "What animal is on the candy?"} ] }, ] inputs = processor.apply_chat_template( messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt", ).to(model.device) outputs = model.generate(**inputs, max_new_tokens=40) print(processor.decode(outputs[0][inputs["input_ids"].shape[-1]:])) - Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use Hcompany/Holo3-35B-A3B with vLLM:
Install from pip and serve model
# Install vLLM from pip: pip install vllm # Start the vLLM server: vllm serve "Hcompany/Holo3-35B-A3B" # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:8000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "Hcompany/Holo3-35B-A3B", "messages": [ { "role": "user", "content": [ { "type": "text", "text": "Describe this image in one sentence." }, { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } } ] } ] }'Use Docker
docker model run hf.co/Hcompany/Holo3-35B-A3B
- SGLang
How to use Hcompany/Holo3-35B-A3B with SGLang:
Install from pip and serve model
# Install SGLang from pip: pip install sglang # Start the SGLang server: python3 -m sglang.launch_server \ --model-path "Hcompany/Holo3-35B-A3B" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "Hcompany/Holo3-35B-A3B", "messages": [ { "role": "user", "content": [ { "type": "text", "text": "Describe this image in one sentence." }, { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } } ] } ] }'Use Docker images
docker run --gpus all \ --shm-size 32g \ -p 30000:30000 \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=<secret>" \ --ipc=host \ lmsysorg/sglang:latest \ python3 -m sglang.launch_server \ --model-path "Hcompany/Holo3-35B-A3B" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "Hcompany/Holo3-35B-A3B", "messages": [ { "role": "user", "content": [ { "type": "text", "text": "Describe this image in one sentence." }, { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } } ] } ] }' - Docker Model Runner
How to use Hcompany/Holo3-35B-A3B with Docker Model Runner:
docker model run hf.co/Hcompany/Holo3-35B-A3B
Is there any special design in the agentic framework (memory/planning)? I only got a 23% score on OSWorld
Hi Hcompany,
Thanks for sharing the model! I self-deployed it via vLLM. It works well for estimating the coordinates of UI elements with structured output. I implemented a simple agent flow on top of the OSWorld framework but only get a 23% success rate. I have attached holo3_agent.py with the predict implementation for your reference. Do you have any suggestions for reproducing the OSWorld experiment? Thanks!
import base64
import json
import logging
import re
import time
import os
from io import BytesIO
from typing import Dict, List, Tuple
import openai
from PIL import Image
from pydantic import BaseModel, Field
# Module-level logger shared by every agent in this file. It stays None until
# Holo3Agent.reset() installs one.
logger = None
# Maximum number of attempts call_llm makes before giving up on a step.
MAX_RETRY_TIMES = 5
def encode_image(image_content):
    """Return the base64 text form of raw image bytes, without resizing.

    The bytes are encoded exactly as received and the result is decoded as
    UTF-8 so it can be embedded in a data URL.
    """
    encoded_bytes = base64.b64encode(image_content)
    return encoded_bytes.decode("utf-8")
# --- Structured output schemas for guaranteed JSON parsing ---
class ComputerUseAction(BaseModel):
    """Schema for structured_outputs: the model returns exactly this JSON.

    The JSON schema generated from this model is what call_llm sends as the
    structured-output constraint, so field names and descriptions are part of
    the prompt contract with the model.
    """

    # NOTE(review): every field except `action` defaults to None although its
    # annotation is not Optional. call_llm strips null values after parsing,
    # so nulls are evidently expected; confirm whether the annotations should
    # become Optional[...] (that would change the emitted JSON schema).

    # Name of the action to run; the only required field.
    action: str = Field(description="The action to perform: left_click, right_click, middle_click, double_click, type, key, mouse_move, left_click_drag, scroll, wait, terminate")
    # Target position for pointer actions, in the 0-1000 normalized space.
    coordinate: List[int] = Field(default=None, description="[x, y] coordinates in 0-1000 normalized space")
    # Key names for action=key, e.g. a combination to press together.
    keys: List[str] = Field(default=None, description="Keys to press (for action=key)")
    # Literal text for action=type.
    text: str = Field(default=None, description="Text to type (for action=type)")
    # Scroll amount for action=scroll.
    pixels: int = Field(default=None, description="Scroll amount (for action=scroll)")
    # Wait duration for action=wait.
    time: float = Field(default=None, description="Seconds to wait (for action=wait)")
    # Outcome reported with action=terminate.
    status: str = Field(default=None, description="Task status: success or failure (for action=terminate)")
class Holo3Agent:
    """Screenshot-driven computer-use agent that queries a Holo3 model.

    Each call to predict() sends the current screenshot (plus a short window
    of earlier screenshots/responses) to an OpenAI-compatible endpoint, asks
    for one JSON action constrained by ComputerUseAction, and converts that
    action into pyautogui code strings or the control tokens "WAIT", "DONE"
    and "FAIL".

    Per-episode state (actions, responses, screenshots, last_error) lives on
    the instance and is cleared by reset().
    """

    def __init__(
        self,
        platform: str = "ubuntu",
        model: str = "holo3-35B-A3B",
        max_tokens: int = 1024,
        top_p: float = 0.9,
        temperature: float = 0.0,
        action_space: str = "pyautogui",
        observation_type: str = "screenshot",
        history_n: int = 1,
        add_thought_prefix: bool = False,
        coordinate_type: str = "relative",
        base_url: str = "http://localhost:8000/v1",
        api_key: str = "EMPTY",
        provider_name: str = "aws",
        screen_size: Tuple[int, int] = (1920, 1080),
        **kwargs,
    ):
        """Store the configuration and start with empty episode state.

        Only action_space="pyautogui" and observation_type="screenshot" are
        accepted. history_n is the number of previous (screenshot, response)
        turns replayed to the model on each step. base_url and api_key can be
        overridden at call time through the HOLO3_BASE_URL / HOLO3_API_KEY
        environment variables (see call_llm). Extra keyword arguments are
        accepted and ignored.
        """
        self.platform = platform
        self.model = model
        self.max_tokens = max_tokens
        self.top_p = top_p
        self.temperature = temperature
        self.action_space = action_space
        self.observation_type = observation_type
        self.history_n = history_n
        self.add_thought_prefix = add_thought_prefix
        self.coordinate_type = coordinate_type
        self.base_url = base_url
        self.api_key = api_key
        self.provider_name = provider_name
        self.screen_size = screen_size

        assert action_space in ["pyautogui"], "Invalid action space"
        assert observation_type in ["screenshot"], "Invalid observation type"

        self.thoughts = []
        self.actions = []
        self.observations = []
        self.responses = []
        self.screenshots = []
        self.last_error = None  # Error feedback for next step

    @staticmethod
    def _log():
        """Return the module logger, or the default one if reset() has not run yet."""
        if logger is not None:
            return logger
        return logging.getLogger("desktopenv.holo3_agent")

    @staticmethod
    def _is_number(value):
        """Return True for int/float values; bool is rejected on purpose."""
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    @classmethod
    def _is_xy(cls, coord):
        """Return True when coord is a list/tuple whose first two items are numbers."""
        return (
            isinstance(coord, (list, tuple))
            and len(coord) >= 2
            and cls._is_number(coord[0])
            and cls._is_number(coord[1])
        )

    @staticmethod
    def _system_prompt():
        """Return the fixed system prompt describing the environment, actions and rules."""
        return (
            "You are a computer use agent that follows instructions and performs desktop computer tasks.\n"
            "You have good knowledge of computers and the internet.\n"
            "For each step, you will get a screenshot of the computer screen and you will predict the next action.\n\n"
            "ENVIRONMENT:\n"
            "* This is an Ubuntu desktop GUI with internet access.\n"
            "* You must click on desktop icons to start applications.\n"
            "* Some applications may take time to start or process actions, so you may need to wait and observe.\n"
            "* When viewing a page, make sure you scroll down to see everything before deciding something isn't available.\n"
            "* The screen uses a 1000x1000 coordinate system. All coordinates must be in this 0-1000 range.\n"
            "* Click buttons and icons in their center, not on edges.\n"
            "* If a click didn't work, try adjusting the coordinates slightly.\n"
            "* The computer's password is 'osworld-public-evaluation' if you need sudo rights.\n\n"
            "ACTIONS (output as JSON with 'action' field and relevant parameters):\n"
            "- left_click: click at coordinate [x, y]\n"
            "- right_click: right-click at coordinate [x, y]\n"
            "- double_click: double-click at coordinate [x, y]\n"
            "- middle_click: middle-click at coordinate [x, y]\n"
            "- mouse_move: move cursor to coordinate [x, y]\n"
            "- left_click_drag: drag to coordinate [x, y]\n"
            "- type: type text string (set 'text' field)\n"
            "- key: press key combination (set 'keys' field, e.g. ['ctrl', 'c'])\n"
            "- scroll: scroll by amount (set 'pixels' field, positive=up, negative=down)\n"
            "- wait: wait for changes (set 'time' field in seconds)\n"
            "- terminate: end the task (set 'status' to 'success' or 'failure')\n\n"
            "RULES:\n"
            "* First, briefly reflect on the current screenshot and what has been done so far.\n"
            "* Then decide the single best next action.\n"
            "* Be careful to ensure coordinates are correct by examining the screenshot closely.\n"
            "* Do NOT terminate until you have visually confirmed the task is fully complete in the screenshot.\n"
            "* Do NOT easily declare failure. Try your best to complete the task.\n"
            "* Only terminate with status='success' when you can see clear evidence the task goal has been achieved.\n"
        )

    def _build_instruction_prompt(self, instruction, current_step):
        """Compose the user-side text: task, older actions, and any pending error.

        Actions that fall inside the replayed history window are sent as full
        chat turns by _build_messages, so only the steps before that window
        are summarised here. Consumes (clears) self.last_error.
        """
        history_start_idx = max(0, current_step - self.history_n)
        previous_actions = [
            f"Step {i + 1}: {past_action}"
            for i, past_action in enumerate(self.actions[:history_start_idx])
        ]
        previous_actions_str = (
            "\n".join(previous_actions) if previous_actions else "None"
        )

        instruction_prompt = (
            f"Task: {instruction}\n\n"
            f"Previous actions:\n{previous_actions_str}\n\n"
        )
        if self.last_error:
            instruction_prompt += (
                f"ERROR from previous step: {self.last_error}\n"
                f"Please correct your action based on this error.\n\n"
            )
            self.last_error = None  # Clear after injecting
        instruction_prompt += (
            "Look at the current screenshot carefully. Reflect on what you see and what has been done so far, "
            "then predict the next action to complete the task."
        )
        return instruction_prompt

    def _build_messages(self, instruction_prompt, screenshot_b64):
        """Build the chat message list: system prompt, replayed history, current screenshot.

        The instruction text rides on the first user turn. self.screenshots
        must already contain the current screenshot as its last element.
        """
        messages = [
            {"role": "system", "content": [{"type": "text", "text": self._system_prompt()}]}
        ]
        instruction_part = {"type": "text", "text": instruction_prompt}
        current_image_part = {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{screenshot_b64}"},
        }

        history_len = min(self.history_n, len(self.responses))
        if history_len <= 0:
            messages.append({"role": "user", "content": [current_image_part, instruction_part]})
            return messages

        history_responses = self.responses[-history_len:]
        # Screenshots that preceded each replayed response (the current one is excluded).
        history_screenshots = self.screenshots[-history_len - 1 : -1]
        instruction_sent = False
        for idx, past_response in enumerate(history_responses):
            if idx < len(history_screenshots):
                img_url = f"data:image/png;base64,{history_screenshots[idx]}"
                content = [{"type": "image_url", "image_url": {"url": img_url}}]
                if idx == 0:
                    content.append(instruction_part)
                    instruction_sent = True
                messages.append({"role": "user", "content": content})
            messages.append({
                "role": "assistant",
                "content": [{"type": "text", "text": past_response}],
            })

        final_content = [current_image_part]
        if not instruction_sent:
            # The first history screenshot was missing, so the task text has
            # not been sent yet; attach it here rather than dropping it.
            final_content.append(instruction_part)
        messages.append({"role": "user", "content": final_content})
        return messages

    @staticmethod
    def _render_prompt_for_log(messages):
        """Flatten messages into log text, replacing base64 images with '<image>'."""
        lines = []
        for msg in messages:
            role = msg.get("role", "")
            content = msg.get("content", [])
            if isinstance(content, str):
                lines.append(f"[{role}] {content}")
                continue
            if not isinstance(content, list):
                continue
            for part in content:
                if not isinstance(part, dict):
                    continue
                if part.get("type") == "text":
                    lines.append(f"[{role}] {part['text']}")
                elif part.get("type") == "image_url":
                    lines.append(f"[{role}] <image>")
        return "\n".join(lines)

    def _describe_action(self, action_data, width, height):
        """Return the one-line action summary stored in self.actions ('' if no action)."""
        if not action_data:
            return ""
        action_name = action_data.get("action", "unknown")
        coord = action_data.get("coordinate")
        if self._is_xy(coord):
            adj_x, adj_y = self._normalize_coord(coord[0], coord[1], width, height)
            return f"{action_name} at normalized ({coord[0]},{coord[1]}) -> pixel ({adj_x},{adj_y})"
        extras = {k: v for k, v in action_data.items() if k != "action" and v is not None}
        return f"{action_name}: {json.dumps(extras)}"

    def predict(self, instruction: str, obs: Dict) -> Tuple[str, List[str]]:
        """Predict the next action(s) based on the current observation.

        Args:
            instruction: Natural-language task description.
            obs: Observation dict; obs["screenshot"] must hold the encoded
                screenshot bytes (they are sent to the model as image/png).

        Returns:
            (response_text, pyautogui_code): the raw model reply and a list of
            pyautogui statements or control tokens. The list is empty when the
            reply could not be turned into a valid action; the reason is then
            kept in self.last_error and fed back to the model on the next call.
        """
        log = self._log()

        screenshot_bytes = obs["screenshot"]
        # Only the dimensions are needed; close the decoder right away.
        with Image.open(BytesIO(screenshot_bytes)) as image:
            width, height = image.size
        print(f"Screen resolution: {width}x{height}")

        # No resize — send raw screenshot (official best practice)
        screenshot_b64 = encode_image(screenshot_bytes)
        self.screenshots.append(screenshot_b64)

        current_step = len(self.actions)
        instruction_prompt = self._build_instruction_prompt(instruction, current_step)
        messages = self._build_messages(instruction_prompt, screenshot_b64)

        # --- Log input prompt (text parts only, no base64 images) ---
        input_prompt_text = self._render_prompt_for_log(messages)
        log.info(f"[Step {current_step + 1}] === INPUT PROMPT ===\n{input_prompt_text}")

        response_text, action_data = self.call_llm(messages)

        # --- Log output response ---
        log.info(f"[Step {current_step + 1}] === MODEL OUTPUT ===\n{response_text}")
        log.info(f"[Step {current_step + 1}] Parsed action: {json.dumps(action_data) if action_data else 'None'}")
        self.responses.append(response_text)

        pyautogui_code = self.action_to_pyautogui(action_data, width, height)
        if self.last_error:
            log.warning(f"[Step {current_step + 1}] ACTION ERROR: {self.last_error}")

        low_level_instruction = self._describe_action(action_data, width, height)
        log.info(f"[Step {current_step + 1}] Pyautogui: {pyautogui_code}")
        # Always record one entry per step so actions and responses stay aligned.
        self.actions.append(low_level_instruction)
        return response_text, pyautogui_code

    def call_llm(self, messages):
        """Call Holo3 via vLLM with structured_outputs for JSON action parsing.

        Makes up to MAX_RETRY_TIMES attempts. Request errors are retried with
        a linearly growing pause; empty replies are retried immediately.

        Returns:
            (response_text, action_dict). action_dict holds the parsed action
            with null-valued fields removed, or None when nothing could be
            parsed. ("", None) is returned when every attempt failed.
        """
        log = self._log()
        base_url = os.environ.get("HOLO3_BASE_URL", self.base_url)
        api_key = os.environ.get("HOLO3_API_KEY", self.api_key)
        client = openai.OpenAI(base_url=base_url, api_key=api_key)

        for attempt in range(1, MAX_RETRY_TIMES + 1):
            log.info(
                f"[Holo3] Generating with model: {self.model} "
                f"(attempt {attempt}/{MAX_RETRY_TIMES})"
            )
            try:
                call_start = time.time()
                response = client.chat.completions.create(
                    model=self.model,
                    messages=messages,
                    max_tokens=self.max_tokens,
                    temperature=self.temperature,
                    top_p=self.top_p,
                    extra_body={
                        "structured_outputs": {
                            "json": ComputerUseAction.model_json_schema()
                        }
                    },
                )
                call_duration = time.time() - call_start
                content = response.choices[0].message.content
            except Exception as e:
                # Boundary with the remote service: log, back off, try again.
                log.error(f"[Holo3] Error calling model: {e}")
                if attempt < MAX_RETRY_TIMES:
                    time.sleep(3 * attempt)
                continue

            # Log token usage and latency
            usage = getattr(response, "usage", None)
            if usage:
                log.info(f"[Holo3] Tokens: input={usage.prompt_tokens}, output={usage.completion_tokens}, total={usage.total_tokens}")
            log.info(f"[Holo3] Latency: {call_duration:.1f}s")

            if not content:
                continue

            try:
                parsed = json.loads(content)
            except json.JSONDecodeError:
                parsed = None
            if isinstance(parsed, dict):
                # Clean nulls
                return content, {k: v for k, v in parsed.items() if v is not None}

            # Invalid JSON, or valid JSON that is not an object: salvage what we can.
            log.warning(f"[Holo3] Failed to parse structured output: {content}")
            return content, self._fallback_parse(content)

        return "", None

    def _fallback_parse(self, content):
        """Fallback parser if structured_outputs fails.

        Drops everything up to the first closing </think> tag, then parses the
        first flat {...} span (no nested braces) as JSON. Returns the resulting
        dict without null values, or None if nothing parseable is found.
        """
        text = content
        think_end = "</think>"
        if think_end in text:
            text = text[text.index(think_end) + len(think_end):].strip()

        match = re.search(r'\{[^{}]+\}', text)
        if match is None:
            return None
        try:
            data = json.loads(match.group())
        except json.JSONDecodeError:
            return None
        return {k: v for k, v in data.items() if v is not None}

    def _normalize_coord(self, x, y, screen_width, screen_height):
        """Convert 0-1000 normalized coordinates to actual screen pixels.

        The result is clamped to the valid pixel range so that a normalized
        value of 1000 maps to the last pixel instead of one past the edge.
        """
        pixel_x = int(x / 1000 * screen_width)
        pixel_y = int(y / 1000 * screen_height)
        pixel_x = max(0, min(pixel_x, screen_width - 1))
        pixel_y = max(0, min(pixel_y, screen_height - 1))
        return pixel_x, pixel_y

    def action_to_pyautogui(self, action_data, screen_width, screen_height):
        """Convert structured action dict to pyautogui code list.

        Returns a one-element list holding a pyautogui statement or one of
        the control tokens "WAIT", "DONE" (terminate with any status other
        than failure) or "FAIL" (terminate with status 'failure'). On invalid
        input it returns [] and stores an explanation in self.last_error;
        on success self.last_error is reset to None.
        """
        if not action_data:
            self.last_error = "Failed to parse model output. No valid action JSON was returned. Please output a valid action."
            return []

        if "action" not in action_data:
            self.last_error = f"Missing 'action' field in response: {json.dumps(action_data)}. You must include an 'action' field."
            return []

        # Normalize action aliases
        action_aliases = {
            "click": "left_click",
            "right-click": "right_click",
            "double-click": "double_click",
            "middle-click": "middle_click",
        }
        action = action_data["action"]
        action = action_aliases.get(action, action)

        valid_actions = {
            "left_click", "right_click", "middle_click", "double_click",
            "type", "key", "scroll", "wait", "terminate",
            "mouse_move", "left_click_drag",
        }
        if action not in valid_actions:
            self.last_error = (
                f"Unknown action '{action}'. Valid actions are: {', '.join(sorted(valid_actions))}. "
                f"Use 'left_click' instead of 'click'."
            )
            return []

        coord = action_data.get("coordinate")
        adj_x = adj_y = None
        if self._is_xy(coord):
            x, y = coord[0], coord[1]
            if not (0 <= x <= 1000 and 0 <= y <= 1000):
                self.last_error = (
                    f"Coordinate [{x}, {y}] is out of range. "
                    f"Coordinates must be in 0-1000 normalized space."
                )
                return []
            adj_x, adj_y = self._normalize_coord(x, y, screen_width, screen_height)

        # Pointer actions and the pyautogui call each one maps to.
        pointer_templates = {
            "left_click": "pyautogui.click({x}, {y})",
            "right_click": "pyautogui.rightClick({x}, {y})",
            "middle_click": "pyautogui.middleClick({x}, {y})",
            "double_click": "pyautogui.doubleClick({x}, {y})",
            "mouse_move": "pyautogui.moveTo({x}, {y})",
            "left_click_drag": "pyautogui.dragTo({x}, {y}, duration=0.5)",
        }

        if action in pointer_templates:
            if adj_x is None:
                self.last_error = (
                    f"Action '{action}' requires a 'coordinate' field with [x, y] values in 0-1000 range."
                )
                return []
            statement = pointer_templates[action].format(x=adj_x, y=adj_y)
        elif action == "type":
            if not action_data.get("text"):
                self.last_error = "Action 'type' requires a 'text' field. Please provide the text to type."
                return []
            # repr() yields a valid Python literal even when the text contains
            # quotes, backslashes or newlines.
            statement = f"pyautogui.typewrite({str(action_data['text'])!r})"
        elif action == "key":
            raw_keys = action_data.get("keys")
            if isinstance(raw_keys, str):
                raw_keys = [raw_keys]  # tolerate a single key given as a bare string
            keys = [k.strip() for k in (raw_keys or []) if isinstance(k, str) and k.strip()]
            if not keys:
                self.last_error = "Action 'key' requires a 'keys' field (list of key names). Example: ['ctrl', 'c']"
                return []
            keys_str = ", ".join(repr(k) for k in keys)
            if len(keys) > 1:
                statement = f"pyautogui.hotkey({keys_str})"
            else:
                statement = f"pyautogui.press({keys_str})"
        elif action == "scroll":
            pixels = action_data.get("pixels", 0)
            if not self._is_number(pixels):
                self.last_error = "Action 'scroll' requires a numeric 'pixels' field (positive=up, negative=down)."
                return []
            statement = f"pyautogui.scroll({pixels})"
        elif action == "wait":
            statement = "WAIT"
        else:  # terminate
            # NOTE(review): assumes the harness treats "FAIL" as the
            # task-failed/infeasible token alongside "DONE" - confirm against
            # the evaluation environment in use.
            status = str(action_data.get("status", "")).strip().lower()
            statement = "FAIL" if status == "failure" else "DONE"

        self.last_error = None
        return [statement]

    def reset(self, _logger=None, **kwargs):
        """Install the module logger and clear all per-episode state.

        Uses _logger when given, otherwise the "desktopenv.holo3_agent"
        logger. Extra keyword arguments are accepted and ignored.
        """
        global logger
        logger = (
            _logger
            if _logger is not None
            else logging.getLogger("desktopenv.holo3_agent")
        )

        self.thoughts = []
        self.actions = []
        self.observations = []
        self.responses = []
        self.screenshots = []
        self.last_error = None
This is a good question
Hi @Wenjin0421 ,
Thanks for trying out Holo3! We actually just released a new cookbook that can help you design your agentic loop and improve your OSWorld score.
You can check it out here: https://hub.hcompany.ai/quickstart
Let us know how it goes!
Thanks for the detailed example! I will give it a try :)