Spaces:

ShubhamRasal
/

robot-policy-eval

Running

Shubham-Rasal Claude Sonnet 4.6 committed 1 day ago

Commit

6156e81

1 Parent(s): 9459552

Add PhAIL tab: 4 autonomous VLA policies head-to-head on Franka

Adds a new first tab that loads the 20-episode stratified sample
from phail-anon/phail-v1.0, covering four policies (ACT, GR00T N1.6,
π0.5, SmolVLA) running autonomously on a Franka Research 3
bin-to-bin pick-and-place task. No GPU required: the tab scores
pre-recorded rollout parquets via the existing Bayesian + SPARC + STL
pipeline.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (2) hide show

app.py +157 -2
requirements.txt +2 -0

app.py CHANGED Viewed

@@ -12,8 +12,9 @@ from plotly.subplots import make_subplots
 from scipy.fft import rfft, rfftfreq
 from scipy import stats
 from datasets import load_dataset
 import gradio as gr
-import io, json
 # ── constants ────────────────────────────────────────────────────────────────
 PALETTE   = ["#60A5FA", "#FB923C", "#F87171", "#34D399", "#A78BFA"]
@@ -81,6 +82,115 @@ def ci_lower(s, n, N=100_000):
 # ── load ALOHA demo data ──────────────────────────────────────────────────────
 _cache = {}
 def load_aloha():
     if "aloha" in _cache:
         return _cache["aloha"]
@@ -492,7 +602,52 @@ def build_ui():
         with gr.Tabs():
-            # ── TAB 1: DEMO DATA ──────────────────────────────────────────────
             with gr.Tab("🦾 Try with Real ALOHA Data"):
                 gr.Markdown("""
                 **Dataset**: [`lerobot/aloha_static_cups_open`](https://huggingface.co/datasets/lerobot/aloha_static_cups_open)

 from scipy.fft import rfft, rfftfreq
 from scipy import stats
 from datasets import load_dataset
+from huggingface_hub import hf_hub_download, list_repo_files
 import gradio as gr
+import io, json, os, tempfile
 # ── constants ────────────────────────────────────────────────────────────────
 PALETTE   = ["#60A5FA", "#FB923C", "#F87171", "#34D399", "#A78BFA"]
 # ── load ALOHA demo data ──────────────────────────────────────────────────────
 _cache = {}
+# ── load PhAIL sample (4 autonomous VLA policies on Franka) ──────────────────
+# Keys are the <model> path segment in sample/inference/<model>/... inside the
+# dataset repo; values are the human-readable labels used as policy_data keys.
+# Parquet files whose <model> segment is not listed here are skipped by the loader.
+PHAIL_POLICIES = {
+    "act":     "ACT",
+    "groot":   "GR00T N1.6",
+    "openpi":  "π0.5",
+    "smolvla": "SmolVLA",
+}
+def load_phail_sample(progress=None):
+    """
+    Download the 20-episode stratified sample from phail-anon/phail-v1.0.
+    Returns policy_data dict ready for run_analysis().
+    """
+    if "phail" in _cache:
+        return _cache["phail"]
+    if progress:
+        progress(0.05, desc="Listing PhAIL sample files on HuggingFace Hub…")
+    # Collect parquet paths under sample/inference/
+    all_files = list(list_repo_files("phail-anon/phail-v1.0", repo_type="dataset"))
+    sample_parquets = [f for f in all_files
+                       if f.startswith("sample/inference/") and f.endswith(".parquet")]
+    if not sample_parquets:
+        raise ValueError("No sample parquet files found in phail-anon/phail-v1.0")
+    if progress:
+        progress(0.15, desc=f"Downloading {len(sample_parquets)} episode files…")
+    policy_data = {label: {"trials": [], "speeds": [], "efforts": [], "zs": []}
+                   for label in PHAIL_POLICIES.values()}
+    for i, fpath in enumerate(sample_parquets):
+        if progress:
+            progress(0.15 + 0.7 * (i / len(sample_parquets)),
+                     desc=f"Processing episode {i+1}/{len(sample_parquets)}…")
+        # Identify policy from path  sample/inference/<model>/batch_*/episode_*/
+        parts = fpath.split("/")
+        model_key = parts[2] if len(parts) > 2 else None
+        label = PHAIL_POLICIES.get(model_key)
+        if label is None:
+            continue
+        local = hf_hub_download(repo_id="phail-anon/phail-v1.0",
+                                 filename=fpath, repo_type="dataset")
+        ep = pd.read_parquet(local)
+        # Extract joint state columns (7-DOF Franka)
+        state_cols  = [c for c in ep.columns if "joint_position" in c or "q_" in c]
+        action_cols = [c for c in ep.columns if "joint_command" in c or "q_cmd" in c]
+        # Fallback: use all numeric columns if named columns absent
+        if not state_cols:
+            numeric = ep.select_dtypes(include=[np.number]).columns.tolist()
+            mid = len(numeric) // 2
+            state_cols  = numeric[:mid] or numeric
+            action_cols = numeric[mid:] or numeric
+        states  = ep[state_cols].values.astype(float)
+        actions = ep[action_cols].values.astype(float) if action_cols else states
+        if len(states) < 4:
+            continue
+        speed, effort, z = extract_episode(states, actions)
+        # Success label from static.json lives alongside parquet — infer from path
+        # PhAIL annotates eval.outcome: "Success" | "Stalled" | "Safety" | "Ran_out_of_time"
+        # Try to load meta.json; fall back to 0
+        meta_path = fpath.replace(".parquet", "").rstrip("/") + "/../static.json"
+        success = 0
+        try:
+            meta_local = hf_hub_download(
+                repo_id="phail-anon/phail-v1.0",
+                filename="/".join(parts[:-1]) + "/static.json",
+                repo_type="dataset")
+            with open(meta_local) as f:
+                meta = json.load(f)
+            outcome = meta.get("eval", {}).get("outcome", "")
+            success = 1 if outcome == "Success" else 0
+        except Exception:
+            pass
+        policy_data[label]["trials"].append(success)
+        policy_data[label]["speeds"].append(speed)
+        policy_data[label]["efforts"].append(effort)
+        policy_data[label]["zs"].append(z)
+    # Drop policies with no data
+    policy_data = {k: v for k, v in policy_data.items() if v["trials"]}
+    if progress:
+        progress(0.95, desc="Finalising…")
+    _cache["phail"] = policy_data
+    return policy_data
+def run_phail(progress=gr.Progress()):
+    progress(0, desc="Connecting to HuggingFace Hub…")
+    policy_data = load_phail_sample(progress)
+    progress(0.9, desc="Running Bayesian + SPARC + STL analysis…")
+    results = run_analysis(policy_data)
+    progress(1.0, desc="Done!")
+    return results
 def load_aloha():
     if "aloha" in _cache:
         return _cache["aloha"]
         with gr.Tabs():
+            # ── TAB 1: PhAIL — 4 autonomous VLA policies ──────────────────────
+            with gr.Tab("🏆 PhAIL: 4 VLA Policies Head-to-Head"):
+                gr.Markdown("""
+**Dataset**: [`phail-anon/phail-v1.0`](https://huggingface.co/datasets/phail-anon/phail-v1.0)
+— 20 stratified episodes from **4 real VLA policies** running autonomously on a **Franka Research 3** robot.
+No GPU needed — we're scoring pre-recorded rollouts, not running the policies.
+| Policy | Type | Developer |
+|--------|------|-----------|
+| ACT | Action Chunking Transformer | Academic (Chi et al.) |
+| GR00T N1.6 | Foundation model | NVIDIA |
+| π0.5 | Diffusion policy VLA | Physical Intelligence |
+| SmolVLA | Compact VLA | HuggingFace |
+Task: **bin-to-bin pick-and-place** (batteries, scissors, towels, wooden spoons).
+Success labels are human-verified from gripper telemetry.
+                """)
+                phail_btn = gr.Button("▶ Load & Analyse PhAIL Sample", variant="primary", size="lg")
+                with gr.Row():
+                    ph_bayes    = gr.Plot(label="Bayesian Posteriors")
+                    ph_bayes_mat = gr.Plot(label="P(row beats col)")
+                with gr.Row():
+                    ph_sparc    = gr.Plot(label="SPARC Smoothness")
+                    ph_speed    = gr.Plot(label="Speed Profiles")
+                with gr.Row():
+                    ph_stl      = gr.Plot(label="STL Robustness")
+                    ph_viols    = gr.Plot(label="Violations")
+                with gr.Row():
+                    ph_radar    = gr.Plot(label="Composite Radar")
+                    ph_rank     = gr.Plot(label="Final Ranking")
+                ph_scorecard = gr.Markdown(label="Scorecard")
+                phail_btn.click(
+                    fn=run_phail,
+                    inputs=[],
+                    outputs=[ph_bayes, ph_bayes_mat,
+                             ph_sparc, ph_speed,
+                             ph_stl,   ph_viols,
+                             ph_radar, ph_rank,
+                             ph_scorecard],
+                )
+            # ── TAB 2: DEMO DATA ──────────────────────────────────────────────
             with gr.Tab("🦾 Try with Real ALOHA Data"):
                 gr.Markdown("""
                 **Dataset**: [`lerobot/aloha_static_cups_open`](https://huggingface.co/datasets/lerobot/aloha_static_cups_open)

requirements.txt CHANGED Viewed

@@ -4,3 +4,5 @@ numpy>=1.24.0
 scipy>=1.10.0
 datasets>=2.14.0
 pandas>=2.0.0

 scipy>=1.10.0
 datasets>=2.14.0
 pandas>=2.0.0
+huggingface_hub>=0.23.0
+pyarrow>=14.0.0