Spaces:

lablab-ai-amd-developer-hackathon
/

ROCmPort-AI

Running

App Files Files Community

tazwarrrr committed on May 9

Commit

944f300

1 Parent(s): 0bfb9d6

Add Gradio Space: app.py and requirements.txt

Browse files

Files changed (2) hide show

app.py +306 -0
requirements.txt +1 -0

app.py ADDED Viewed

	@@ -0,0 +1,306 @@

+"""
+ROCmPort AI — Gradio Space entry point
+Calls the deployed FastAPI backend (Render) and streams agent events.
+"""
+import gradio as gr
+import httpx
+import json
+# Base URL of the deployed backend; port_kernel() POSTs to f"{BACKEND_URL}/port".
+BACKEND_URL = "https://rocmport-ai-q2b1.onrender.com"
+# Emoji prefix for each agent name in the streamed log (looked up by the event's "agent" field).
+AGENT_ICONS = {
+    "analyzer": "🔍",
+    "translator": "🔄",
+    "optimizer": "⚡",
+    "tester": "🧪",
+    "coordinator": "🎯",
+}
+# Emoji for each event status in the streamed log (looked up by the event's "status" field).
+STATUS_ICONS = {
+    "waiting": "⏳",
+    "running": "🔄",
+    "done": "✅",
+    "failed": "❌",
+    "retrying": "🔁",
+}
+# Demo CUDA kernels offered through gr.Examples below. They are sent to the backend as plain
+# text and are never compiled or executed by this app.
+# Shared-memory reduction whose final unrolled stage hard-codes a 32-wide warp; this is also
+# the default content of the input editor.
+EXAMPLE_REDUCTION = """\
+__global__ void reduction_kernel(float* g_idata, float* g_odata, unsigned int n) {
+    extern __shared__ float sdata[];
+    unsigned int tid = threadIdx.x;
+    unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
+    float mySum = (i < n) ? g_idata[i] : 0;
+    if (i + blockDim.x < n) mySum += g_idata[i + blockDim.x];
+    sdata[tid] = mySum;
+    __syncthreads();
+    for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) {
+        if (tid < s) sdata[tid] = mySum = mySum + sdata[tid + s];
+        __syncthreads();
+    }
+    // DELIBERATE BUG: assumes warpSize=32, wrong on AMD (warpSize=64)
+    if (tid < 32) {
+        volatile float* vsmem = sdata;
+        vsmem[tid] = mySum = mySum + vsmem[tid + 32];
+        vsmem[tid] = mySum = mySum + vsmem[tid + 16];
+        vsmem[tid] = mySum = mySum + vsmem[tid + 8];
+        vsmem[tid] = mySum = mySum + vsmem[tid + 4];
+        vsmem[tid] = mySum = mySum + vsmem[tid + 2];
+        vsmem[tid] = mySum = mySum + vsmem[tid + 1];
+    }
+    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
+}"""
+# Element-wise vector add that picks a "warp leader" with a hard-coded modulo 32.
+# The "\\n" below is an escaped backslash so the kernel text contains a literal \n for printf.
+EXAMPLE_VECTOR_ADD = """\
+__global__ void vectorAdd(const float *A, const float *B, float *C, int n) {
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i < n) {
+        C[i] = A[i] + B[i];
+        // Warp-size assumption: 32 threads per warp (wrong on AMD wavefront-64)
+        if (threadIdx.x % 32 == 0) {
+            printf("Warp leader: %d\\n", threadIdx.x / 32);
+        }
+    }
+}"""
+# Matrix multiply using two 32x32 __shared__ tiles; the kernel text indexes A, B and C
+# without any bounds checks and iterates N / 32 tiles.
+EXAMPLE_MATMUL = """\
+__global__ void matmul(float *A, float *B, float *C, int N) {
+    __shared__ float As[32][32];
+    __shared__ float Bs[32][32];
+    int row = blockIdx.y * 32 + threadIdx.y;
+    int col = blockIdx.x * 32 + threadIdx.x;
+    float sum = 0.0f;
+    for (int k = 0; k < N / 32; k++) {
+        As[threadIdx.y][threadIdx.x] = A[row * N + k * 32 + threadIdx.x];
+        Bs[threadIdx.y][threadIdx.x] = B[(k * 32 + threadIdx.y) * N + col];
+        __syncthreads();
+        for (int n = 0; n < 32; n++)
+            sum += As[threadIdx.y][n] * Bs[n][threadIdx.x];
+        __syncthreads();
+    }
+    C[row * N + col] = sum;
+}"""
+def port_kernel(cuda_code: str, kernel_name: str, simple_mode: bool):
+    """Generator: streams agent events and yields (log_markdown, hip_code)."""
+    if not cuda_code or len(cuda_code.strip()) < 10:
+        yield "❌ Please provide CUDA kernel code (at least 10 characters).", ""
+        return
+    kernel_name = kernel_name.strip() or "custom"
+    log_lines: list[str] = []
+    hip_code = ""
+    payload = {
+        "cuda_code": cuda_code,
+        "kernel_name": kernel_name,
+        "simple_mode": bool(simple_mode),
+    }
+    log_lines.append("🚀 **Connecting to ROCmPort AI backend…**")
+    yield "\n\n".join(log_lines), hip_code
+    try:
+        with httpx.Client(timeout=180.0) as client:
+            with client.stream("POST", f"{BACKEND_URL}/port", json=payload) as resp:
+                resp.raise_for_status()
+                for line in resp.iter_lines():
+                    if not line:
+                        continue
+                    if not line.startswith("data: "):
+                        continue
+                    data = line[6:]
+                    if data.strip() == "[DONE]":
+                        break
+                    try:
+                        event = json.loads(data)
+                    except json.JSONDecodeError:
+                        continue
+                    agent = event.get("agent", "system")
+                    status = event.get("status", "running")
+                    message = event.get("message", "")
+                    detail = event.get("detail") or ""
+                    icon = AGENT_ICONS.get(agent, "🤖")
+                    s_icon = STATUS_ICONS.get(status, "•")
+                    log_lines.append(f"{icon} **{agent.capitalize()}** {s_icon} — {message}")
+                    # Extract HIP code from coordinator or translator done events
+                    if status == "done" and detail:
+                        try:
+                            detail_json = json.loads(detail)
+                            candidate = (
+                                detail_json.get("hip_code")
+                                or detail_json.get("optimized_code")
+                                or detail_json.get("translated_code")
+                                or ""
+                            )
+                            if candidate:
+                                hip_code = candidate
+                        except (json.JSONDecodeError, AttributeError):
+                            pass
+                    yield "\n\n".join(log_lines), hip_code
+    except httpx.ConnectError:
+        log_lines.append(
+            "❌ **Could not connect to backend.**\n\n"
+            "> The server may be in a cold-start state — please wait ~30 s and retry."
+        )
+        yield "\n\n".join(log_lines), hip_code
+        return
+    except httpx.TimeoutException:
+        log_lines.append("⏱️ **Request timed out.** The pipeline may still be running — try again shortly.")
+        yield "\n\n".join(log_lines), hip_code
+        return
+    except httpx.HTTPStatusError as exc:
+        log_lines.append(f"❌ **HTTP {exc.response.status_code}**: {exc.response.text[:300]}")
+        yield "\n\n".join(log_lines), hip_code
+        return
+    except Exception as exc:  # noqa: BLE001
+        log_lines.append(f"❌ **Unexpected error**: {exc}")
+        yield "\n\n".join(log_lines), hip_code
+        return
+    if not hip_code:
+        log_lines.append("\n⚠️ Pipeline finished but no HIP code was extracted. Check agent logs above.")
+    else:
+        log_lines.append("\n✅ **Migration complete.** HIP code is shown on the right →")
+    yield "\n\n".join(log_lines), hip_code
+# ── UI ────────────────────────────────────────────────────────────────────────
+# Custom stylesheet passed to gr.Blocks(css=...): styles the "panel-header" class used by
+# the Input/Output headings and hides the page's <footer> element.
+CSS = """
+.panel-header { font-weight: 600; font-size: 1rem; margin-bottom: 4px; }
+footer { display: none !important; }
+"""
+with gr.Blocks(
+    title="ROCmPort AI — CUDA → ROCm Migration",
+    theme=gr.themes.Default(primary_hue="orange"),
+    css=CSS,
+) as demo:
+    gr.Markdown(
+        """# ⚡ ROCmPort AI
+### Agentic CUDA → ROCm/HIP migration with wavefront-64 bug detection
+> **Backend API**: [rocmport-ai-q2b1.onrender.com](https://rocmport-ai-q2b1.onrender.com) &nbsp;|&nbsp;
+> **GitHub**: [tazwaryayyyy/ROCmPort-AI](https://github.com/tazwaryayyyy/ROCmPort-AI)
+`hipify-clang` translates CUDA API calls mechanically — it **cannot** detect that `if (tid < 32)` in a
+warp-level reduction silently skips lanes 32–63 on AMD wavefront-64.
+The code compiles, the output is wrong, no errors. **ROCmPort AI catches this before execution.**
+"""
+    )
+    with gr.Row():
+        # ── Left: input ──────────────────────────────────────────────────────
+        with gr.Column(scale=1):
+            gr.Markdown("### 📥 Input", elem_classes="panel-header")
+            cuda_input = gr.Code(
+                label="CUDA Kernel Code",
+                language="c++",
+                lines=22,
+                value=EXAMPLE_REDUCTION,
+            )
+            with gr.Row():
+                kernel_name = gr.Textbox(
+                    label="Kernel Name",
+                    value="reduction",
+                    placeholder="e.g. reduction, matmul, vector_add",
+                    scale=2,
+                )
+                simple_mode = gr.Checkbox(
+                    label="Explain Like I'm 5",
+                    value=False,
+                    scale=1,
+                )
+            with gr.Row():
+                port_btn = gr.Button("⚡ Port to ROCm", variant="primary", scale=3)
+                clear_btn = gr.Button("🗑 Clear", scale=1)
+            gr.Examples(
+                examples=[
+                    [EXAMPLE_REDUCTION, "reduction", False],
+                    [EXAMPLE_VECTOR_ADD, "vector_add", False],
+                    [EXAMPLE_MATMUL, "matmul", False],
+                ],
+                inputs=[cuda_input, kernel_name, simple_mode],
+                label="Demo Kernels (pre-loaded with intentional AMD bugs)",
+            )
+        # ── Right: output ─────────────────────────────────────────────────────
+        with gr.Column(scale=1):
+            gr.Markdown("### 📤 Output", elem_classes="panel-header")
+            log_output = gr.Markdown(
+                value="*Agent steps will appear here once you click **Port to ROCm**.*",
+                label="Agent Pipeline Log",
+            )
+            hip_output = gr.Code(
+                label="Translated & Optimized HIP Code",
+                language="c++",
+                lines=18,
+            )
+    gr.Markdown(
+        """
+---
+### How the pipeline works
+| Agent | Role |
+|-------|------|
+| 🔍 **Analyzer** | Scans CUDA for AMD-specific risks: wavefront size, ballot/shuffle idioms, shared-memory layout |
+| 🔄 **Translator** | Runs `hipify` then applies LLM-guided fixes for bugs `hipify` cannot detect |
+| 🧪 **Tester** | Verifies compilation with `hipcc` and checks output correctness |
+| ⚡ **Optimizer** | Proposes MI300X-specific optimisations; re-tested against baseline |
+| 🎯 **Coordinator** | Orchestrates the loop; retries up to 3× if the optimised output regresses |
+### The key bug: warp-size assumption
+```c
+// NVIDIA (warpSize = 32) — silently WRONG on AMD
+if (tid < 32) { vsmem[tid] += vsmem[tid + 32]; ... }
+// AMD-correct (wavefront = 64)
+if (tid < 64) {
+    vsmem[tid] += vsmem[tid + 32];
+    if (tid < 32) { vsmem[tid] += vsmem[tid + 16]; ... }
+}
+```
+### Benchmark highlights (MI300X, ROCm 7.0)
+| Kernel | Result |
+|--------|--------|
+| matrix_multiply 512×512 | 2.91× speedup over baseline HIP |
+| vector_add 32 M elements | ~3 918 GB/s (~74 % of MI300X peak) |
+| reduction 16 M elements | correctness PASS after wavefront-64 fix |
+> Source: `docs/benchmark_runs/` — real `rocprof` CSV output, May 2026.
+> Results vary with kernel complexity; these figures are not guaranteed on every input.
+"""
+    )
+    # ── Event wiring ──────────────────────────────────────────────────────────
+    port_btn.click(
+        fn=port_kernel,
+        inputs=[cuda_input, kernel_name, simple_mode],
+        outputs=[log_output, hip_output],
+    )
+    clear_btn.click(
+        fn=lambda: ("*Agent steps will appear here once you click **Port to ROCm**.*", ""),
+        outputs=[log_output, hip_output],
+    )
+# Start the Gradio server only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ httpx==0.27.2