tazwarrrr committed on
Commit
944f300
·
1 Parent(s): 0bfb9d6

Add Gradio Space: app.py and requirements.txt

Browse files
Files changed (2) hide show
  1. app.py +306 -0
  2. requirements.txt +1 -0
app.py ADDED
@@ -0,0 +1,306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ROCmPort AI — Gradio Space entry point
3
+ Calls the deployed FastAPI backend (Render) and streams agent events.
4
+ """
5
+
6
+ import gradio as gr
7
+ import httpx
8
+ import json
9
+
10
# Base URL of the deployed FastAPI backend; port_kernel posts to f"{BACKEND_URL}/port".
BACKEND_URL = "https://rocmport-ai-q2b1.onrender.com"

# Emoji printed in front of an agent's name in the pipeline log.
# Agents missing from this map get a fallback icon chosen by the caller.
AGENT_ICONS = dict(
    analyzer="🔍",
    translator="🔄",
    optimizer="⚡",
    tester="🧪",
    coordinator="🎯",
)

# Emoji printed after an agent's name to show the status reported by the event.
STATUS_ICONS = dict(
    waiting="⏳",
    running="🔄",
    done="✅",
    failed="❌",
    retrying="🔁",
)
27
+
28
# Demo CUDA kernels offered in the UI's example picker. Each one is sent to the
# backend verbatim as the "cuda_code" payload field.
# NOTE(review): the indentation inside these literals looks like it was stripped
# by the diff view this text was captured from — confirm against the real file.

# Shared-memory sum reduction whose final stage hard-codes a 32-lane warp
# (see the "DELIBERATE BUG" comment inside the kernel).
EXAMPLE_REDUCTION = """\
__global__ void reduction_kernel(float* g_idata, float* g_odata, unsigned int n) {
extern __shared__ float sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
float mySum = (i < n) ? g_idata[i] : 0;
if (i + blockDim.x < n) mySum += g_idata[i + blockDim.x];
sdata[tid] = mySum;
__syncthreads();
for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) {
if (tid < s) sdata[tid] = mySum = mySum + sdata[tid + s];
__syncthreads();
}
// DELIBERATE BUG: assumes warpSize=32, wrong on AMD (warpSize=64)
if (tid < 32) {
volatile float* vsmem = sdata;
vsmem[tid] = mySum = mySum + vsmem[tid + 32];
vsmem[tid] = mySum = mySum + vsmem[tid + 16];
vsmem[tid] = mySum = mySum + vsmem[tid + 8];
vsmem[tid] = mySum = mySum + vsmem[tid + 4];
vsmem[tid] = mySum = mySum + vsmem[tid + 2];
vsmem[tid] = mySum = mySum + vsmem[tid + 1];
}
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}"""

# Element-wise vector add that also prints a "warp leader" index computed with a
# hard-coded 32. The doubled backslash keeps a literal \n escape in the CUDA
# source instead of embedding a real newline in the Python string.
EXAMPLE_VECTOR_ADD = """\
__global__ void vectorAdd(const float *A, const float *B, float *C, int n) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < n) {
C[i] = A[i] + B[i];
// Warp-size assumption: 32 threads per warp (wrong on AMD wavefront-64)
if (threadIdx.x % 32 == 0) {
printf("Warp leader: %d\\n", threadIdx.x / 32);
}
}
}"""

# Tiled matrix multiply using two 32x32 shared-memory tiles.
EXAMPLE_MATMUL = """\
__global__ void matmul(float *A, float *B, float *C, int N) {
__shared__ float As[32][32];
__shared__ float Bs[32][32];
int row = blockIdx.y * 32 + threadIdx.y;
int col = blockIdx.x * 32 + threadIdx.x;
float sum = 0.0f;
for (int k = 0; k < N / 32; k++) {
As[threadIdx.y][threadIdx.x] = A[row * N + k * 32 + threadIdx.x];
Bs[threadIdx.y][threadIdx.x] = B[(k * 32 + threadIdx.y) * N + col];
__syncthreads();
for (int n = 0; n < 32; n++)
sum += As[threadIdx.y][n] * Bs[n][threadIdx.x];
__syncthreads();
}
C[row * N + col] = sum;
}"""
83
+
84
+
85
def _extract_hip_code(detail) -> str:
    """Return the HIP source carried by an event's ``detail`` field, or ``""``.

    ``detail`` may be a JSON-encoded string or an already-decoded mapping.
    The keys ``hip_code``, ``optimized_code`` and ``translated_code`` are tried
    in that order; the first non-empty string wins.
    """
    if isinstance(detail, str):
        try:
            detail = json.loads(detail)
        except json.JSONDecodeError:
            # Free-text detail: nothing to extract.
            return ""
    if not isinstance(detail, dict):
        return ""
    for key in ("hip_code", "optimized_code", "translated_code"):
        candidate = detail.get(key)
        if candidate and isinstance(candidate, str):
            return candidate
    return ""


def port_kernel(cuda_code: str, kernel_name: str, simple_mode: bool):
    """Stream the backend's agent events for one CUDA → HIP port.

    Posts the kernel to ``{BACKEND_URL}/port`` and reads the response as a
    stream of ``data: <json>`` lines, stopping at ``data: [DONE]``. Each JSON
    event may carry ``agent``, ``status``, ``message`` and ``detail``.

    Args:
        cuda_code: CUDA source to port; rejected if it has fewer than 10
            non-whitespace-trimmed characters.
        kernel_name: Label sent to the backend; blank falls back to "custom".
        simple_mode: Forwarded to the backend as the ``simple_mode`` flag.

    Yields:
        ``(log_markdown, hip_code)`` tuples. ``log_markdown`` is the log so far
        with entries separated by blank lines; ``hip_code`` is the most recent
        HIP source extracted from a ``done`` event, or ``""``.

    Errors are never raised to the caller: connection failures, timeouts, HTTP
    error statuses and unexpected exceptions are appended to the log, yielded
    once, and end the generator.
    """
    if not cuda_code or len(cuda_code.strip()) < 10:
        yield "❌ Please provide CUDA kernel code (at least 10 characters).", ""
        return

    kernel_name = (kernel_name or "").strip() or "custom"
    log_lines: list[str] = []
    hip_code = ""

    def snapshot():
        # Reads the enclosing variables at call time, so it always reflects
        # the latest log and the latest extracted code.
        return "\n\n".join(log_lines), hip_code

    payload = {
        "cuda_code": cuda_code,
        "kernel_name": kernel_name,
        "simple_mode": bool(simple_mode),
    }

    log_lines.append("🚀 **Connecting to ROCmPort AI backend…**")
    yield snapshot()

    failure = ""
    try:
        with httpx.Client(timeout=180.0) as client:
            with client.stream("POST", f"{BACKEND_URL}/port", json=payload) as resp:
                if not resp.is_success:
                    # A streamed body is not loaded automatically. Read it while
                    # the stream is still open so the HTTPStatusError handler
                    # can quote exc.response.text instead of raising
                    # ResponseNotRead.
                    resp.read()
                resp.raise_for_status()

                for line in resp.iter_lines():
                    # The space after "data:" is optional in server-sent events.
                    if not line.startswith("data:"):
                        continue

                    data = line[5:].strip()
                    if data == "[DONE]":
                        break

                    try:
                        event = json.loads(data)
                    except json.JSONDecodeError:
                        continue
                    if not isinstance(event, dict):
                        # Valid JSON but not an event object; ignore it rather
                        # than aborting the whole stream.
                        continue

                    # "or" also covers explicit nulls, not just missing keys.
                    agent = str(event.get("agent") or "system")
                    status = str(event.get("status") or "running")
                    message = event.get("message") or ""

                    icon = AGENT_ICONS.get(agent, "🤖")
                    s_icon = STATUS_ICONS.get(status, "•")

                    log_lines.append(f"{icon} **{agent.capitalize()}** {s_icon} — {message}")

                    # Later "done" events override earlier ones, so the final
                    # code shown is the last one the pipeline produced.
                    if status == "done":
                        hip_code = _extract_hip_code(event.get("detail")) or hip_code

                    yield snapshot()

    except httpx.ConnectError:
        failure = (
            "❌ **Could not connect to backend.**\n\n"
            "> The server may be in a cold-start state — please wait ~30 s and retry."
        )
    except httpx.TimeoutException:
        failure = "⏱️ **Request timed out.** The pipeline may still be running — try again shortly."
    except httpx.HTTPStatusError as exc:
        failure = f"❌ **HTTP {exc.response.status_code}**: {exc.response.text[:300]}"
    except Exception as exc:  # noqa: BLE001
        # Top-level boundary for the UI: report the problem instead of crashing.
        failure = f"❌ **Unexpected error**: {exc}"

    if failure:
        log_lines.append(failure)
        yield snapshot()
        return

    if not hip_code:
        log_lines.append("\n⚠️ Pipeline finished but no HIP code was extracted. Check agent logs above.")
    else:
        log_lines.append("\n✅ **Migration complete.** HIP code is shown on the right →")

    yield snapshot()
177
+
178
+
179
+ # ── UI ────────────────────────────────────────────────────────────────────────
180
+
181
# Extra styling for the page: bold panel headers and a hidden Gradio footer.
CSS = """
.panel-header { font-weight: 600; font-size: 1rem; margin-bottom: 4px; }
footer { display: none !important; }
"""

# Placeholder shown in the log panel before a run and after "Clear".
_IDLE_LOG_TEXT = "*Agent steps will appear here once you click **Port to ROCm**.*"

with gr.Blocks(
    title="ROCmPort AI — CUDA → ROCm Migration",
    theme=gr.themes.Default(primary_hue="orange"),
    css=CSS,
) as demo:

    gr.Markdown(
        """# ⚡ ROCmPort AI
### Agentic CUDA → ROCm/HIP migration with wavefront-64 bug detection

> **Backend API**: [rocmport-ai-q2b1.onrender.com](https://rocmport-ai-q2b1.onrender.com) &nbsp;|&nbsp;
> **GitHub**: [tazwaryayyyy/ROCmPort-AI](https://github.com/tazwaryayyyy/ROCmPort-AI)

`hipify-clang` translates CUDA API calls mechanically — it **cannot** detect that `if (tid < 32)` in a
warp-level reduction silently skips lanes 32–63 on AMD wavefront-64.
The code compiles, the output is wrong, no errors. **ROCmPort AI catches this before execution.**
"""
    )

    with gr.Row():
        # ── Left: input ──────────────────────────────────────────────────────
        with gr.Column(scale=1):
            gr.Markdown("### 📥 Input", elem_classes="panel-header")
            cuda_input = gr.Code(
                label="CUDA Kernel Code",
                # Gradio's identifier for C++ highlighting is "cpp"; "c++" is
                # not a supported language name.
                language="cpp",
                lines=22,
                value=EXAMPLE_REDUCTION,
            )
            with gr.Row():
                kernel_name = gr.Textbox(
                    label="Kernel Name",
                    value="reduction",
                    placeholder="e.g. reduction, matmul, vector_add",
                    scale=2,
                )
                simple_mode = gr.Checkbox(
                    label="Explain Like I'm 5",
                    value=False,
                    scale=1,
                )
            with gr.Row():
                port_btn = gr.Button("⚡ Port to ROCm", variant="primary", scale=3)
                clear_btn = gr.Button("🗑 Clear", scale=1)

            # Each example row fills the three inputs in the order listed below.
            gr.Examples(
                examples=[
                    [EXAMPLE_REDUCTION, "reduction", False],
                    [EXAMPLE_VECTOR_ADD, "vector_add", False],
                    [EXAMPLE_MATMUL, "matmul", False],
                ],
                inputs=[cuda_input, kernel_name, simple_mode],
                label="Demo Kernels (pre-loaded with intentional AMD bugs)",
            )

        # ── Right: output ─────────────────────────────────────────────────────
        with gr.Column(scale=1):
            gr.Markdown("### 📤 Output", elem_classes="panel-header")
            log_output = gr.Markdown(
                value=_IDLE_LOG_TEXT,
                label="Agent Pipeline Log",
            )
            hip_output = gr.Code(
                label="Translated & Optimized HIP Code",
                language="cpp",
                lines=18,
            )

    gr.Markdown(
        """
---
### How the pipeline works

| Agent | Role |
|-------|------|
| 🔍 **Analyzer** | Scans CUDA for AMD-specific risks: wavefront size, ballot/shuffle idioms, shared-memory layout |
| 🔄 **Translator** | Runs `hipify` then applies LLM-guided fixes for bugs `hipify` cannot detect |
| 🧪 **Tester** | Verifies compilation with `hipcc` and checks output correctness |
| ⚡ **Optimizer** | Proposes MI300X-specific optimisations; re-tested against baseline |
| 🎯 **Coordinator** | Orchestrates the loop; retries up to 3× if the optimised output regresses |

### The key bug: warp-size assumption

```c
// NVIDIA (warpSize = 32) — silently WRONG on AMD
if (tid < 32) { vsmem[tid] += vsmem[tid + 32]; ... }

// AMD-correct (wavefront = 64)
if (tid < 64) {
vsmem[tid] += vsmem[tid + 32];
if (tid < 32) { vsmem[tid] += vsmem[tid + 16]; ... }
}
```

### Benchmark highlights (MI300X, ROCm 7.0)

| Kernel | Result |
|--------|--------|
| matrix_multiply 512×512 | 2.91× speedup over baseline HIP |
| vector_add 32 M elements | ~3 918 GB/s (~74 % of MI300X peak) |
| reduction 16 M elements | correctness PASS after wavefront-64 fix |

> Source: `docs/benchmark_runs/` — real `rocprof` CSV output, May 2026.
> Results vary with kernel complexity; these figures are not guaranteed on every input.
"""
    )

    # ── Event wiring ──────────────────────────────────────────────────────────
    # port_kernel is a generator, so every yielded (log, code) pair refreshes
    # both output components while the run is in progress.
    port_btn.click(
        fn=port_kernel,
        inputs=[cuda_input, kernel_name, simple_mode],
        outputs=[log_output, hip_output],
    )

    # "Clear" resets the two output panels only; the inputs are left as they are.
    clear_btn.click(
        fn=lambda: (_IDLE_LOG_TEXT, ""),
        outputs=[log_output, hip_output],
    )

if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ httpx==0.27.2