[ { "avg_reward": 0.6822916666666667, "avg_steps": 4.025, "benchmark": "WebShop", "episodes": 200, "method": "Stochastic-TBRM", "notes": "Fine macro-action Stochastic-TBRM checkpoint for WebShop. Evaluation disables structured product scoring, structured option scoring, learned click rankers, and action repair.", "path": "tbrm/webshop-no-scorer", "success_rate": 1.0 }, { "avg_reward": 0.32298518459516506, "avg_steps": 25.928909952606634, "benchmark": "ScienceWorld", "episodes": 211, "method": "Stochastic-TBRM", "notes": "Stochastic-TBRM checkpoint evaluated on the 211-task ScienceWorld test split.", "path": "tbrm/scienceworld", "success_rate": 0.3886255924170616 }, { "avg_reward": 0.4701492537313433, "avg_steps": 26.044776119402986, "benchmark": "ALFWorld", "episodes": 134, "method": "Stochastic-TBRM", "notes": "Refined target-aware ALFWorld Stochastic-TBRM checkpoint with unavailable-macro masks.", "path": "tbrm/alfworld", "success_rate": 0.4701492537313433 }, { "avg_reward": 0.649, "avg_steps": 3.97, "benchmark": "WebShop", "episodes": 200, "method": "DMPO-Qwen3.5-4B", "path": "dmpo-qwen35/webshop", "success_rate": 0.955 }, { "avg_reward": 0.4111506774905855, "avg_steps": 30.66350710900474, "benchmark": "ScienceWorld", "episodes": 211, "method": "DMPO-Qwen3.5-4B", "path": "dmpo-qwen35/scienceworld", "success_rate": 0.4218009478672986 }, { "avg_reward": 0.5746268656716418, "avg_steps": 25.37313432835821, "benchmark": "ALFWorld", "episodes": 134, "method": "DMPO-Qwen3.5-4B", "path": "dmpo-qwen35/alfworld", "success_rate": 0.5746268656716418 } ]