"""Task definitions for the Financial Task Environment. Contains 10 tasks backed by real Excel workbooks covering diverse enterprise finance & accounting workflows (QA, calculation, validation, data entry, formatting, modeling, consolidation). Each task ships a source .xlsx that the agent must read or modify via Python code execution. """ from __future__ import annotations import os from pathlib import Path from typing import Any, Dict, List # Base directory where xlsx files live (data//) DATA_DIR = Path(os.environ.get("FINANCIAL_ENV_DATA_DIR", Path(__file__).parent / "data")) TASKS: Dict[str, Dict[str, Any]] = {} # --------------------------------------------------------------------------- # Helper to build source / reference paths # --------------------------------------------------------------------------- def _paths(task_id: str, src: str, ref: str | None = None): """Return dict with resolved source and optional reference paths.""" d: Dict[str, Any] = { "source_file": str(DATA_DIR / task_id / src), } if ref: d["reference_file"] = str(DATA_DIR / task_id / ref) return d # ── EASY ────────────────────────────────────────────────────────────────── # Task 1 — QA: count rows (Calculation) TASKS["task_1"] = { "id": "task_1", "orig_id": "119", "title": "Count Plants in Spreadsheet", "difficulty": "easy", "task_type": "QA", "category": "Calculation", "instruction": "How many plants are recorded in the spreadsheet?", "constraints": "", "reference_answer": "85", **_paths("119", "119_src_0.xlsx"), } # Task 2 — QA: value retrieval (Cross-sheet Retrieval) TASKS["task_2"] = { "id": "task_2", "orig_id": "118", "title": "Retrieve TW EOL Charge", "difficulty": "easy", "task_type": "QA", "category": "Cross-sheet/file Retrieval", "instruction": "What is the TW EOL charge for 2002? Please provide just the amount.", "constraints": "", "reference_answer": "113291", **_paths("118", "118_src_0.xlsx"), } # Task 3 — QA: multi-step calculation (Calculation) TASKS["task_3"] = { "id": "task_3", "orig_id": "34", "title": "Portfolio Mark-to-Market Change", "difficulty": "easy", "task_type": "QA", "category": "Calculation", "instruction": ( "Assume the following changes occur in the Jul\u2013Dec 2002 market: " "Flat curve prices increase uniformly by $2/MWh; Peak 6x16 curve prices " "increase uniformly by $5/MWh; monthly contract volumes (Flat and Peak " "Total MWh) remain unchanged. Based on the 2002 table, calculate: " "(1) the total added value (mark-to-market change) for the combined " "Flat + Peak portfolio; and (2) what percentage of this added value " "comes from the Peak 6x16 contracts rather than the Flat contracts." ), "constraints": "", "reference_answer": ( "The total added value of the July\u2013December 2002 portfolio is " "$1,989,600 (in absolute terms). Of this amount, approximately 27.9% " "(about 28%) comes from the Peak 6x16 contracts, with the remaining " "~72.1% coming from the Flat contracts." ), **_paths("34", "34_src_0.xlsx"), } # ── MEDIUM ──────────────────────────────────────────────────────────────── # Task 4 — Modify: summarise imbalances (Calculation + modify) TASKS["task_4"] = { "id": "task_4", "orig_id": "35", "title": "Summarize Pipeline Imbalances", "difficulty": "medium", "task_type": "MODIFY", "category": "Calculation", "instruction": ( "Summarize the volume and dollar imbalances that exist between the " "various pipeline operators (Operators) and Transwestern." ), "constraints": ( "You will be given an Excel file as input. Perform all required " "operations by modifying the existing workbook. You may add new sheets " "if necessary, but you must preserve all original sheets and their " "contents. Return the full updated workbook." ), **_paths("35", "35_src_0.xlsx", "35_ref_0.xlsx"), } # Task 5 — Modify: audit & fix formulas (Validation / Review) TASKS["task_5"] = { "id": "task_5", "orig_id": "40", "title": "Audit and Correct Formula Errors", "difficulty": "medium", "task_type": "MODIFY", "category": "Validation / Review, Calculation", "instruction": ( "Audit the workbook and correct the formula errors in place so numbers " "calculate properly." ), "constraints": ( "You will be given an Excel file as input. Perform all required " "operations by modifying the existing workbook. You may add new sheets " "if necessary, but you must preserve all original sheets and their " "contents. Return the full updated workbook." ), **_paths("40", "40_src_0.xlsx", "40_ref_0.xlsx"), } # Task 6 — Modify: create table + filter (Structuring / Formatting) TASKS["task_6"] = { "id": "task_6", "orig_id": "60", "title": "Create Table and Apply Filter", "difficulty": "medium", "task_type": "MODIFY", "category": "Structuring / Formatting", "instruction": ( "On the All Natural Gas sheet, create an Excel table and filter to " "show only the COUNTERPARTY entries highlighted in red." ), "constraints": ( "You will be given an Excel file as input. Perform all required " "operations by modifying the existing workbook. You may add new sheets " "if necessary, but you must preserve all original sheets and their " "contents. Return the full updated workbook." ), **_paths("60", "60_src_0.xlsx", "60_ref_0.xlsx"), } # Task 7 — Modify: data entry + formatting (Data Entry / Import) TASKS["task_7"] = { "id": "task_7", "orig_id": "21", "title": "Add Weekday Row and Data Entry", "difficulty": "medium", "task_type": "MODIFY", "category": "Data Entry / Import, Structuring / Formatting", "instruction": ( "Add a weekday line directly below the date headers and update the " "12/31/2001 (Mon) column. For that day, there are no \"Receipts\"; " "record disbursements of $1,980,800 to Calpine (Power Purchases) and " "$100,000 to an unspecified vendor (Gas Purchases). Under Enron Facility " "Services, enter $3,101,855 for \"$2.5 per day\" and -$2,081,386 for " "\"estimate receipt\"; in Personnel, EES is $584,500; leave all other " "items as \"-\"." ), "constraints": ( "You will be given an Excel file as input. Perform all required " "operations by modifying the existing workbook. You may add new sheets " "if necessary, but you must preserve all original sheets and their " "contents. Return the full updated workbook." ), **_paths("21", "21_src_0.xlsx", "21_ref_0.xlsx"), } # ── HARD ────────────────────────────────────────────────────────────────── # Task 8 — Modify: balance-sheet validation + indicator calcs TASKS["task_8"] = { "id": "task_8", "orig_id": "0", "title": "Balance Sheet Validation and Indicators", "difficulty": "hard", "task_type": "MODIFY", "category": "Validation / Review, Calculation, Structuring / Formatting", "instruction": ( "Complete the validation and indicator calculations as follows: on the " "Balance Sheet, add a control to ensure TOTAL ASSETS equals TOTAL " "LIABILITIES AND EQUITY; on the Income Statement (Revenue & Expenses), " "add an Equity Roll Forward Test to reconcile equity movement and " "highlight any differences." ), "constraints": ( "You will be given an Excel file as input. Perform all required " "operations by modifying the existing workbook. You may add new sheets " "if necessary, but you must preserve all original sheets and their " "contents. Return the full updated workbook." ), **_paths("0", "0_src_0.xlsx", "0_ref_0.xlsx"), } # Task 9 — Modify: add new sheet mirroring structure (Financial Modeling) TASKS["task_9"] = { "id": "task_9", "orig_id": "24", "title": "Create Scenario3 Worksheet", "difficulty": "hard", "task_type": "MODIFY", "category": "Structuring / Formatting, Financial Modeling", "instruction": ( 'Add a new worksheet named "Scenario3" to the same workbook, mirroring ' "the structure, row/column layout, monthly detail table, and chart area " 'of "Scenario1". For Scenario3, update the hedging assumptions to a ' "balanced allocation: 10-Yr 25%, 5-Yr 20%, 1-Yr 15%, May-Sep 20%, " "Q3 15%. Keep the note \"Maximum Monthly Average Short Position to " 'Cover (July Peak) = 30,508 MW" unchanged; only the new sheet should ' "be added, and formulas may be used within it." ), "constraints": ( "You will be given an Excel file as input. Perform all required " "operations by modifying the existing workbook. You may add new sheets " "if necessary, but you must preserve all original sheets and their " "contents. Return the full updated workbook." ), **_paths("24", "24_src_0.xlsx", "24_ref_0.xlsx"), } # Task 10 — Modify: cross-sheet consolidation (multi-type) TASKS["task_10"] = { "id": "task_10", "orig_id": "67", "title": "Consolidate by Type and Area", "difficulty": "hard", "task_type": "MODIFY", "category": "Structuring / Formatting, Calculation, Validation / Review, Cross-sheet Retrieval", "instruction": ( "Create a new 'by type_area' worksheet based on the Summary and the " "other tabs. It should present two separate tables summarized by " "Imbal Type; within each table, consolidate by area, include Volume, " "Value and Date, and calculate totals. Finally, confirm that the value " "and volume totals tie to the totals shown on the Summary." ), "constraints": ( "You will be given an Excel file as input. Perform all required " "operations by modifying the existing workbook. You may add new sheets " "if necessary, but you must preserve all original sheets and their " "contents. Return the full updated workbook." ), **_paths("67", "67_src_0.xlsx", "67_ref_0.xlsx"), } # --------------------------------------------------------------------------- # Manifest loader — pulls additional tasks (Finch-50, OSWorld docx, PPTArena, # TSBench) from data/manifest.jsonl. Each manifest row already has the same # shape as the hand-written TASKS dict above, plus a `family` and `split` field. # --------------------------------------------------------------------------- import json _MANIFEST_PATH = Path(__file__).parent / "data" / "manifest.jsonl" def _difficulty_for(primary_tag: str) -> str: """Heuristic difficulty bucket from a Finch primary_tag.""" easy = {"Cross-sheet/file Retrieval", "Summary / Visualization"} hard = {"Financial Modeling", "Validation / Review"} if primary_tag in easy: return "easy" if primary_tag in hard: return "hard" return "medium" def _load_manifest() -> None: """Load manifest.jsonl rows into TASKS in-place. No-op if missing.""" if not _MANIFEST_PATH.exists(): return with open(_MANIFEST_PATH) as f: for line in f: line = line.strip() if not line: continue row = json.loads(line) tid = row["id"] if tid in TASKS: continue # don't overwrite hand-curated tasks tags = row.get("all_tags", []) primary = row.get("primary_tag", "") TASKS[tid] = { "id": tid, "orig_id": row.get("orig_id", ""), "title": f"{primary}: {row['instruction'][:60]}…", "difficulty": _difficulty_for(primary), "task_type": row.get("task_type", "MODIFY"), "category": ", ".join(tags) if tags else primary, "family": row.get("family", "xlsx"), "split": row.get("split", "train"), "primary_tag": primary, "all_tags": tags, "instruction": row["instruction"], "constraints": row.get("constraints", ""), "source_file": str(Path(__file__).parent / row["source_file"]), } if row.get("reference_file"): TASKS[tid]["reference_file"] = str(Path(__file__).parent / row["reference_file"]) # Pass through the docx evaluator block, resolving expected_files # to ABSOLUTE paths so the gold-stash dedup-by-string can match # them against the (already absolute) reference_file. if row.get("evaluator"): ev = dict(row["evaluator"]) if ev.get("checks"): repo_root = Path(__file__).parent new_checks = [] for c in ev["checks"]: nc = dict(c) nc["expected_files"] = [ str(repo_root / f) if f and not Path(f).is_absolute() else f for f in (c.get("expected_files") or []) ] new_checks.append(nc) ev["checks"] = new_checks TASKS[tid]["evaluator"] = ev _load_manifest() # --------------------------------------------------------------------------- # Helper accessors # --------------------------------------------------------------------------- # Hand-curated tasks come first (sorted numerically), then manifest-loaded ones. def _sort_key(tid: str): parts = tid.split("_") if len(parts) == 2 and parts[1].isdigit(): return (0, int(parts[1])) # task_ return (1, tid) # everything else (finch_, osworld_, …) TASK_IDS: List[str] = sorted(TASKS.keys(), key=_sort_key) def get_task(task_id: str) -> Dict[str, Any]: """Return a task dict by ID or raise KeyError.""" if task_id not in TASKS: raise KeyError( f"Unknown task_id '{task_id}'. Available: {len(TASK_IDS)} tasks." ) return TASKS[task_id] def list_tasks(split: str | None = None, family: str | None = None) -> List[Dict[str, str]]: """Return a summary list of tasks, optionally filtered by split or family.""" out = [] for tid in TASK_IDS: t = TASKS[tid] if split is not None and t.get("split", "train") != split: continue if family is not None and t.get("family", "xlsx") != family: continue out.append({ "id": t["id"], "title": t["title"], "difficulty": t["difficulty"], "task_type": t["task_type"], "category": t["category"], "family": t.get("family", "xlsx"), "split": t.get("split", "train"), }) return out def split_ids(split: str) -> List[str]: """Return task IDs in a given split (train/eval). Unmarked tasks count as train.""" return [tid for tid in TASK_IDS if TASKS[tid].get("split", "train") == split]