[
  {
    "avg_reward": 0.6822916666666667,
    "avg_steps": 4.025,
    "benchmark": "WebShop",
    "episodes": 200,
    "method": "Stochastic-TBRM",
    "notes": "Fine macro-action Stochastic-TBRM checkpoint for WebShop. Evaluation disables structured product scoring, structured option scoring, learned click rankers, and action repair.",
    "path": "tbrm/webshop-no-scorer",
    "success_rate": 1.0
  },
  {
    "avg_reward": 0.32298518459516506,
    "avg_steps": 25.928909952606634,
    "benchmark": "ScienceWorld",
    "episodes": 211,
    "method": "Stochastic-TBRM",
    "notes": "Stochastic-TBRM checkpoint evaluated on the 211-task ScienceWorld test split.",
    "path": "tbrm/scienceworld",
    "success_rate": 0.3886255924170616
  },
  {
    "avg_reward": 0.4701492537313433,
    "avg_steps": 26.044776119402986,
    "benchmark": "ALFWorld",
    "episodes": 134,
    "method": "Stochastic-TBRM",
    "notes": "Refined target-aware ALFWorld Stochastic-TBRM checkpoint with unavailable-macro masks.",
    "path": "tbrm/alfworld",
    "success_rate": 0.4701492537313433
  },
  {
    "avg_reward": 0.649,
    "avg_steps": 3.97,
    "benchmark": "WebShop",
    "episodes": 200,
    "method": "DMPO-Qwen3.5-4B",
    "path": "dmpo-qwen35/webshop",
    "success_rate": 0.955
  },
  {
    "avg_reward": 0.4111506774905855,
    "avg_steps": 30.66350710900474,
    "benchmark": "ScienceWorld",
    "episodes": 211,
    "method": "DMPO-Qwen3.5-4B",
    "path": "dmpo-qwen35/scienceworld",
    "success_rate": 0.4218009478672986
  },
  {
    "avg_reward": 0.5746268656716418,
    "avg_steps": 25.37313432835821,
    "benchmark": "ALFWorld",
    "episodes": 134,
    "method": "DMPO-Qwen3.5-4B",
    "path": "dmpo-qwen35/alfworld",
    "success_rate": 0.5746268656716418
  }
]