docs fix
Browse files
- Dockerfile +3 -1
- README.md +16 -11
- backend/tools/static_analyzer.py +19 -1
- dataset/finetune_qwen.py +22 -9
- dataset/requirements-finetune.txt +6 -0
- docs/FAILURE_CASES.md +3 -2
- docs/JUDGE_MODE.md +36 -30
- docs/LIVE_RESULTS.md +22 -32
- docs/benchmark_runs/mi300x_results.txt +25 -14
Dockerfile
CHANGED
|
@@ -5,11 +5,13 @@ RUN npm ci
|
|
| 5 |
COPY frontend/ ./
|
| 6 |
RUN npm run build
|
| 7 |
|
| 8 |
-
FROM rocm/dev-ubuntu-22.04:
|
| 9 |
WORKDIR /app
|
| 10 |
COPY backend/requirements.txt .
|
| 11 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 12 |
COPY . .
|
| 13 |
COPY --from=frontend-build /app/frontend/dist ./frontend/dist
|
|
|
|
|
|
|
| 14 |
EXPOSE 8000
|
| 15 |
CMD ["uvicorn", "backend.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
|
|
|
| 5 |
COPY frontend/ ./
|
| 6 |
RUN npm run build
|
| 7 |
|
| 8 |
+
FROM rocm/dev-ubuntu-22.04:7.2.2-complete
|
| 9 |
WORKDIR /app
|
| 10 |
COPY backend/requirements.txt .
|
| 11 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 12 |
COPY . .
|
| 13 |
COPY --from=frontend-build /app/frontend/dist ./frontend/dist
|
| 14 |
+
# Runtime envs: GROQ_API_KEY, ROCM_AVAILABLE, HIPCC_PATH, ROCPROF_PATH.
|
| 15 |
+
# Pass secrets at docker run/deploy time; do not bake .env into the image.
|
| 16 |
EXPOSE 8000
|
| 17 |
CMD ["uvicorn", "backend.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
README.md
CHANGED
|
@@ -52,17 +52,17 @@ If the optimized output underperforms the baseline, the coordinator retries the
|
|
| 52 |
|
| 53 |
---
|
| 54 |
|
| 55 |
-
##
|
| 56 |
|
| 57 |
-
|
| 58 |
|
| 59 |
| Kernel | Input | Baseline HIP | Optimized HIP | Result |
|
| 60 |
|--------|-------|-------------|---------------|--------|
|
| 61 |
-
| matrix_multiply |
|
| 62 |
-
| reduction |
|
| 63 |
-
| vector_add |
|
| 64 |
|
| 65 |
-
Hardware: AMD Instinct MI300X
|
| 66 |
|
| 67 |
---
|
| 68 |
|
|
@@ -164,14 +164,19 @@ start.bat
|
|
| 164 |
./start.sh
|
| 165 |
|
| 166 |
# Manual
|
| 167 |
-
|
| 168 |
-
|
|
|
|
|
|
|
|
|
|
| 169 |
cp .env.example .env
|
| 170 |
# Add GROQ_API_KEY
|
| 171 |
-
|
|
|
|
|
|
|
| 172 |
```
|
| 173 |
|
| 174 |
-
Open `
|
| 175 |
|
| 176 |
### Docker
|
| 177 |
|
|
@@ -255,4 +260,4 @@ ROCmPort AI/
|
|
| 255 |
|
| 256 |
## License
|
| 257 |
|
| 258 |
-
Apache 2.0 — see [`LICENSE`](LICENSE)
|
|
|
|
| 52 |
|
| 53 |
---
|
| 54 |
|
| 55 |
+
## Reproducible Demo Results
|
| 56 |
|
| 57 |
+
These numbers are deterministic `demo_artifact` values returned by the backend when `ROCM_AVAILABLE=false`. Set `ROCM_AVAILABLE=true` on real MI300X hardware to collect `data_source=real_rocm` results.
|
| 58 |
|
| 59 |
| Kernel | Input | Baseline HIP | Optimized HIP | Result |
|
| 60 |
|--------|-------|-------------|---------------|--------|
|
| 61 |
+
| matrix_multiply | demo artifact | 121.4ms | 89.1ms | **1.36x speedup** |
|
| 62 |
+
| reduction | demo artifact | 88.2ms | 68.7ms | **1.28x speedup** |
|
| 63 |
+
| vector_add | demo artifact | 45.1ms | 38.2ms | **1.18x speedup** |
|
| 64 |
|
| 65 |
+
Hardware class: AMD Instinct MI300X, 192GB HBM3, wavefront=64
|
| 66 |
|
| 67 |
---
|
| 68 |
|
|
|
|
| 164 |
./start.sh
|
| 165 |
|
| 166 |
# Manual
|
| 167 |
+
python -m venv .venv
|
| 168 |
+
# Windows: .venv\Scripts\activate
|
| 169 |
+
# Linux/Mac:
|
| 170 |
+
. .venv/bin/activate
|
| 171 |
+
pip install -r backend/requirements.txt
|
| 172 |
cp .env.example .env
|
| 173 |
# Add GROQ_API_KEY
|
| 174 |
+
npm --prefix frontend install
|
| 175 |
+
npm --prefix frontend run build
|
| 176 |
+
python -m uvicorn backend.main:app --reload --port 8000
|
| 177 |
```
|
| 178 |
|
| 179 |
+
Open `http://localhost:8000/index.html` in a browser.
|
| 180 |
|
| 181 |
### Docker
|
| 182 |
|
|
|
|
| 260 |
|
| 261 |
## License
|
| 262 |
|
| 263 |
+
Apache 2.0 — see [`LICENSE`](LICENSE)
|
backend/tools/static_analyzer.py
CHANGED
|
@@ -58,6 +58,15 @@ _PATTERNS: List[tuple] = [
|
|
| 58 |
"Replace __ballot_sync(0xffffffff, cond) with __ballot(cond). "
|
| 59 |
"The return type changes from uint32_t to uint64_t — update downstream bitmask logic."
|
| 60 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
(
|
| 62 |
"activemask_warp",
|
| 63 |
re.compile(r'\b__activemask\s*\(\s*\)', re.MULTILINE),
|
|
@@ -86,7 +95,7 @@ _PATTERNS: List[tuple] = [
|
|
| 86 |
(
|
| 87 |
"inline_ptx_block",
|
| 88 |
re.compile(r'asm\s+volatile\s*\(', re.MULTILINE),
|
| 89 |
-
"
|
| 90 |
"Inline PTX assembly is NVIDIA-specific ISA. hipify cannot translate PTX semantics. "
|
| 91 |
"The kernel may compile under hipcc but will have undefined or incorrect behaviour.",
|
| 92 |
"Replace inline PTX with portable HIP intrinsics or CDNA ISA equivalents. "
|
|
@@ -101,6 +110,15 @@ _PATTERNS: List[tuple] = [
|
|
| 101 |
"Replace with #include <hip/hip_runtime.h>. "
|
| 102 |
"hipify-clang does this automatically in its first pass."
|
| 103 |
),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
(
|
| 105 |
"shared_memory_no_padding",
|
| 106 |
re.compile(r'__shared__\s+\w+\s+\w+\s*\[\s*\d+\s*\]', re.MULTILINE),
|
|
|
|
| 58 |
"Replace __ballot_sync(0xffffffff, cond) with __ballot(cond). "
|
| 59 |
"The return type changes from uint32_t to uint64_t — update downstream bitmask logic."
|
| 60 |
),
|
| 61 |
+
(
|
| 62 |
+
"shfl_wavefront_offset_16",
|
| 63 |
+
re.compile(r'\b__shfl(?:_down|_up|_xor)?\s*\([^;]*,\s*16\s*(?:,|\))', re.MULTILINE),
|
| 64 |
+
"HIGH",
|
| 65 |
+
"__shfl* with offset=16 often encodes a 32-lane warp reduction tail. "
|
| 66 |
+
"On AMD wavefront=64 the reduction should include an offset=32 step first.",
|
| 67 |
+
"Audit the shuffle reduction and add a wavefront-64 step, e.g. offset=32 "
|
| 68 |
+
"before offset=16 where the algorithm reduces a full wavefront."
|
| 69 |
+
),
|
| 70 |
(
|
| 71 |
"activemask_warp",
|
| 72 |
re.compile(r'\b__activemask\s*\(\s*\)', re.MULTILINE),
|
|
|
|
| 95 |
(
|
| 96 |
"inline_ptx_block",
|
| 97 |
re.compile(r'asm\s+volatile\s*\(', re.MULTILINE),
|
| 98 |
+
"CRITICAL",
|
| 99 |
"Inline PTX assembly is NVIDIA-specific ISA. hipify cannot translate PTX semantics. "
|
| 100 |
"The kernel may compile under hipcc but will have undefined or incorrect behaviour.",
|
| 101 |
"Replace inline PTX with portable HIP intrinsics or CDNA ISA equivalents. "
|
|
|
|
| 110 |
"Replace with #include <hip/hip_runtime.h>. "
|
| 111 |
"hipify-clang does this automatically in its first pass."
|
| 112 |
),
|
| 113 |
+
(
|
| 114 |
+
"cuda_library_dependency",
|
| 115 |
+
re.compile(r'#\s*include\s*[<"][^>"]*(?:cub|thrust|cudnn)[^>"]*[>"]|\b(?:cub|thrust|cudnn)::', re.MULTILINE),
|
| 116 |
+
"HIGH",
|
| 117 |
+
"CUDA library dependency detected. hipify can rename some CUB/Thrust/cuDNN symbols, "
|
| 118 |
+
"but API coverage and performance behavior are not guaranteed to match rocPRIM/hipCUB/MIOpen.",
|
| 119 |
+
"Manually review the translated library call, compare against rocPRIM/hipCUB/MIOpen, "
|
| 120 |
+
"and add correctness/performance tests for the specific primitive."
|
| 121 |
+
),
|
| 122 |
(
|
| 123 |
"shared_memory_no_padding",
|
| 124 |
re.compile(r'__shared__\s+\w+\s+\w+\s*\[\s*\d+\s*\]', re.MULTILINE),
|
dataset/finetune_qwen.py
CHANGED
|
@@ -1,4 +1,6 @@
|
|
| 1 |
# finetune_qwen.py
|
|
|
|
|
|
|
| 2 |
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
|
| 3 |
from peft import LoraConfig, get_peft_model, TaskType
|
| 4 |
from trl import SFTTrainer
|
|
@@ -8,7 +10,12 @@ import torch
|
|
| 8 |
MODEL = "Qwen/Qwen2.5-Coder-7B-Instruct"
|
| 9 |
DATASET = "tazwarrrr/cuda-to-rocm-wavefront-bugs"
|
| 10 |
OUTPUT = "/workspace/rocmport-qwen-finetuned"
|
| 11 |
-
HF_TOKEN = "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
# Load dataset
|
| 14 |
ds = load_dataset(DATASET)
|
|
@@ -32,15 +39,21 @@ def format_example(example):
|
|
| 32 |
|
| 33 |
|
| 34 |
formatted = ds.map(format_example)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
# Load model
|
| 37 |
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
|
| 38 |
model = AutoModelForCausalLM.from_pretrained(
|
| 39 |
MODEL,
|
| 40 |
-
torch_dtype=torch.
|
| 41 |
-
device_map="auto",
|
| 42 |
trust_remote_code=True
|
| 43 |
)
|
|
|
|
|
|
|
| 44 |
|
| 45 |
# LoRA config
|
| 46 |
lora_config = LoraConfig(
|
|
@@ -61,7 +74,8 @@ args = TrainingArguments(
|
|
| 61 |
gradient_accumulation_steps=4,
|
| 62 |
warmup_steps=10,
|
| 63 |
learning_rate=2e-4,
|
| 64 |
-
|
|
|
|
| 65 |
logging_steps=5,
|
| 66 |
save_strategy="epoch",
|
| 67 |
report_to="none"
|
|
@@ -70,7 +84,7 @@ args = TrainingArguments(
|
|
| 70 |
trainer = SFTTrainer(
|
| 71 |
model=model,
|
| 72 |
tokenizer=tokenizer,
|
| 73 |
-
train_dataset=
|
| 74 |
dataset_text_field="text",
|
| 75 |
max_seq_length=2048,
|
| 76 |
args=args
|
|
@@ -80,8 +94,7 @@ trainer.train()
|
|
| 80 |
trainer.save_model(OUTPUT)
|
| 81 |
|
| 82 |
# Push to HuggingFace
|
| 83 |
-
model.
|
| 84 |
-
|
| 85 |
-
tokenizer.push_to_hub(
|
| 86 |
-
token=HF_TOKEN)
|
| 87 |
print("Done. Model pushed to HuggingFace.")
|
|
|
|
| 1 |
# finetune_qwen.py
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
|
| 5 |
from peft import LoraConfig, get_peft_model, TaskType
|
| 6 |
from trl import SFTTrainer
|
|
|
|
| 10 |
MODEL = "Qwen/Qwen2.5-Coder-7B-Instruct"
|
| 11 |
DATASET = "tazwarrrr/cuda-to-rocm-wavefront-bugs"
|
| 12 |
OUTPUT = "/workspace/rocmport-qwen-finetuned"
|
| 13 |
+
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 14 |
+
if not HF_TOKEN:
|
| 15 |
+
raise RuntimeError("Set HF_TOKEN in the environment before running fine-tuning.")
|
| 16 |
+
REPO_ID = "tazwarrrr/rocmport-qwen-wavefront-finetuned"
|
| 17 |
+
|
| 18 |
+
os.makedirs(OUTPUT, exist_ok=True)
|
| 19 |
|
| 20 |
# Load dataset
|
| 21 |
ds = load_dataset(DATASET)
|
|
|
|
| 39 |
|
| 40 |
|
| 41 |
formatted = ds.map(format_example)
|
| 42 |
+
if hasattr(formatted, "keys"):
|
| 43 |
+
train_split = "train" if "train" in formatted else next(iter(formatted.keys()))
|
| 44 |
+
train_dataset = formatted[train_split]
|
| 45 |
+
else:
|
| 46 |
+
train_dataset = formatted
|
| 47 |
|
| 48 |
# Load model
|
| 49 |
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
|
| 50 |
model = AutoModelForCausalLM.from_pretrained(
|
| 51 |
MODEL,
|
| 52 |
+
torch_dtype=torch.bfloat16,
|
|
|
|
| 53 |
trust_remote_code=True
|
| 54 |
)
|
| 55 |
+
if torch.cuda.is_available():
|
| 56 |
+
model.to("cuda")
|
| 57 |
|
| 58 |
# LoRA config
|
| 59 |
lora_config = LoraConfig(
|
|
|
|
| 74 |
gradient_accumulation_steps=4,
|
| 75 |
warmup_steps=10,
|
| 76 |
learning_rate=2e-4,
|
| 77 |
+
bf16=torch.cuda.is_available(),
|
| 78 |
+
fp16=False,
|
| 79 |
logging_steps=5,
|
| 80 |
save_strategy="epoch",
|
| 81 |
report_to="none"
|
|
|
|
| 84 |
trainer = SFTTrainer(
|
| 85 |
model=model,
|
| 86 |
tokenizer=tokenizer,
|
| 87 |
+
train_dataset=train_dataset,
|
| 88 |
dataset_text_field="text",
|
| 89 |
max_seq_length=2048,
|
| 90 |
args=args
|
|
|
|
| 94 |
trainer.save_model(OUTPUT)
|
| 95 |
|
| 96 |
# Push to HuggingFace
|
| 97 |
+
merged_model = model.merge_and_unload()
|
| 98 |
+
merged_model.push_to_hub(REPO_ID, token=HF_TOKEN)
|
| 99 |
+
tokenizer.push_to_hub(REPO_ID, token=HF_TOKEN)
|
|
|
|
| 100 |
print("Done. Model pushed to HuggingFace.")
|
dataset/requirements-finetune.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
accelerate==0.34.2
|
| 2 |
+
datasets==3.1.0
|
| 3 |
+
peft==0.13.2
|
| 4 |
+
torch
|
| 5 |
+
transformers==4.46.3
|
| 6 |
+
trl==0.9.6
|
docs/FAILURE_CASES.md
CHANGED
|
@@ -70,8 +70,9 @@ cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_item
|
|
| 70 |
**What hipify does**: renames cudaFree to hipFree, cuda headers to hip headers.
|
| 71 |
Does NOT fix the shuffle semantics.
|
| 72 |
|
| 73 |
-
**What ROCmPort AI does**: flags
|
| 74 |
-
|
|
|
|
| 75 |
|
| 76 |
**Status**: Compiled and executed on AMD Instinct MI300X (gfx942), ROCm 7.2.
|
| 77 |
Numerical correctness not verified — requires reference CPU implementation.
|
|
|
|
| 70 |
**What hipify does**: renames cudaFree to hipFree, cuda headers to hip headers.
|
| 71 |
Does NOT fix the shuffle semantics.
|
| 72 |
|
| 73 |
+
**What ROCmPort AI does**: flags `__shfl_sync` family calls as CRITICAL risk,
|
| 74 |
+
and flags unsuffixed `__shfl_down(..., 16)` style reductions as HIGH risk.
|
| 75 |
+
It identifies the offset=16 assumption and suggests a wavefront-64 aware rewrite.
|
| 76 |
|
| 77 |
**Status**: Compiled and executed on AMD Instinct MI300X (gfx942), ROCm 7.2.
|
| 78 |
Numerical correctness not verified — requires reference CPU implementation.
|
docs/JUDGE_MODE.md
CHANGED
|
@@ -1,42 +1,48 @@
|
|
| 1 |
# Judge Mode Walkthrough
|
| 2 |
|
| 3 |
-
Use this sequence during technical evaluation
|
|
|
|
| 4 |
|
| 5 |
## Goal
|
| 6 |
-
|
|
|
|
| 7 |
|
| 8 |
## Flow
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
|
|
|
|
|
|
| 16 |
|
| 17 |
## Baseline Policy
|
|
|
|
| 18 |
- Primary baseline: straight hipify output with minimal required compile edits.
|
| 19 |
-
-
|
| 20 |
-
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
-
|
| 26 |
-
-
|
| 27 |
-
-
|
| 28 |
-
- Final
|
| 29 |
-
|
| 30 |
-
##
|
| 31 |
-
|
| 32 |
-
-
|
| 33 |
-
-
|
| 34 |
-
-
|
| 35 |
-
- "Here is a case where gain is limited, and why."
|
| 36 |
|
| 37 |
## Pass/Fail Criteria
|
|
|
|
| 38 |
A demo is credible if:
|
| 39 |
-
|
| 40 |
-
-
|
| 41 |
-
-
|
| 42 |
-
-
|
|
|
|
|
|
| 1 |
# Judge Mode Walkthrough
|
| 2 |
|
| 3 |
+
Use this sequence during technical evaluation with the current React UI and
|
| 4 |
+
FastAPI SSE stream.
|
| 5 |
|
| 6 |
## Goal
|
| 7 |
+
|
| 8 |
+
Make every claim falsifiable and tied to fields returned by the backend.
|
| 9 |
|
| 10 |
## Flow
|
| 11 |
+
|
| 12 |
+
1. Open `http://localhost:8000/index.html`.
|
| 13 |
+
2. Choose or paste a CUDA kernel.
|
| 14 |
+
3. Run ROCmPort AI and watch the five agent cards:
|
| 15 |
+
analyzer, translator, optimizer, tester, coordinator.
|
| 16 |
+
4. Confirm the tester event reports speedup, bandwidth, bottleneck, and data source.
|
| 17 |
+
5. Confirm the coordinator event produces the final report JSON in its SSE `detail`.
|
| 18 |
+
6. Use `/benchmark-report` for reproducible demo-artifact metrics and data-source labels.
|
| 19 |
+
7. Show a limited-gain case such as `vector_add` and explain the bandwidth-bound result.
|
| 20 |
|
| 21 |
## Baseline Policy
|
| 22 |
+
|
| 23 |
- Primary baseline: straight hipify output with minimal required compile edits.
|
| 24 |
+
- Demo-mode baselines come from `backend/tools/demo_artifacts.py`.
|
| 25 |
+
- Real hardware baselines require `ROCM_AVAILABLE=true` and captured `hipcc`/`rocprof` logs.
|
| 26 |
+
- Never mix `demo_artifact` and `real_rocm` numbers in the same result table.
|
| 27 |
+
|
| 28 |
+
## Visible Artifacts In Current UI
|
| 29 |
+
|
| 30 |
+
- CUDA source input.
|
| 31 |
+
- Agent event stream.
|
| 32 |
+
- Tester summary: execution time, bandwidth utilization, bottleneck, notes.
|
| 33 |
+
- Final summary footer: changes made, critical bugs found, compile/migration success, data source.
|
| 34 |
+
|
| 35 |
+
## Additional Artifacts Available By API
|
| 36 |
+
|
| 37 |
+
- `/benchmark-report`: reproducible benchmark summary and static risk scans.
|
| 38 |
+
- `/export`: migration diff, original CUDA, optimized HIP, and report markdown.
|
| 39 |
+
- `/demo-kernels`: source for bundled demo kernels.
|
|
|
|
| 40 |
|
| 41 |
## Pass/Fail Criteria
|
| 42 |
+
|
| 43 |
A demo is credible if:
|
| 44 |
+
|
| 45 |
+
- Every speedup is tied to its `data_source`.
|
| 46 |
+
- The baseline definition is stated before showing speedup.
|
| 47 |
+
- Static risk findings match the analyzer event or `/benchmark-report`.
|
| 48 |
+
- At least one non-perfect or limited-gain case is included.
|
docs/LIVE_RESULTS.md
CHANGED
|
@@ -1,40 +1,30 @@
|
|
| 1 |
-
#
|
| 2 |
|
| 3 |
-
|
| 4 |
-
|
|
|
|
|
|
|
| 5 |
|
| 6 |
## Benchmark Results
|
| 7 |
|
| 8 |
-
| Kernel |
|
| 9 |
-
|--------|------------
|
| 10 |
-
| matrix_multiply |
|
| 11 |
-
| reduction |
|
| 12 |
-
| vector_add |
|
|
|
|
| 13 |
|
| 14 |
-
## Hardware
|
| 15 |
|
| 16 |
-
-
|
| 17 |
-
-
|
| 18 |
-
-
|
| 19 |
-
-
|
| 20 |
-
-
|
| 21 |
-
- **data_source**: real_rocm
|
| 22 |
|
| 23 |
-
##
|
| 24 |
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
**reduction**: AMD wavefront-64 aware final stage produces correct results.
|
| 31 |
-
The original CUDA kernel with hardcoded warp-32 assumption silently skips
|
| 32 |
-
lanes 32-63 and returns a wrong sum. ROCmPort AI catches this at static
|
| 33 |
-
scan before any compilation attempt.
|
| 34 |
-
|
| 35 |
-
**vector_add**: 4077.6 GB/s achieved on a memory-bound kernel — 77% of
|
| 36 |
-
MI300X's 5.3 TB/s theoretical HBM3 peak. This demonstrates the bandwidth
|
| 37 |
-
advantage of MI300X over H100 (3.35 TB/s peak) for memory-bound workloads.
|
| 38 |
-
|
| 39 |
-
## Correctness Verification
|
| 40 |
-
All kernels executed without runtime errors on gfx942.
|
|
|
|
| 1 |
+
# Reproducible Results
|
| 2 |
|
| 3 |
+
The backend returns deterministic benchmark artifacts unless `ROCM_AVAILABLE=true`
|
| 4 |
+
is set on real ROCm hardware. These values come from
|
| 5 |
+
`backend/tools/demo_artifacts.py` and are labelled `data_source="demo_artifact"`
|
| 6 |
+
in API responses.
|
| 7 |
|
| 8 |
## Benchmark Results
|
| 9 |
|
| 10 |
+
| Kernel | Baseline HIP (ms) | Optimized HIP (ms) | Speedup | Bandwidth | Bottleneck |
|
| 11 |
+
|--------|-------------------|--------------------|---------|-----------|------------|
|
| 12 |
+
| matrix_multiply | 121.4 | 89.1 | 1.36x | 1843.7 GB/s | memory-bound |
|
| 13 |
+
| reduction | 88.2 | 68.7 | 1.28x | 531.8 GB/s | compute-bound after wavefront fix |
|
| 14 |
+
| vector_add | 45.1 | 38.2 | 1.18x | 4821.6 GB/s | memory-bound |
|
| 15 |
+
| convolution_2d | 211.7 | 158.3 | 1.34x | 2134.8 GB/s | memory-bound |
|
| 16 |
|
| 17 |
+
## Hardware Context
|
| 18 |
|
| 19 |
+
- GPU class: AMD Instinct MI300X
|
| 20 |
+
- VRAM: 192GB HBM3
|
| 21 |
+
- Theoretical memory bandwidth: 5.3 TB/s
|
| 22 |
+
- Wavefront size: 64
|
| 23 |
+
- API data source in local/demo mode: `demo_artifact`
|
|
|
|
| 24 |
|
| 25 |
+
## Real Hardware Mode
|
| 26 |
|
| 27 |
+
Set `ROCM_AVAILABLE=true`, `HIPCC_PATH=hipcc`, and `ROCPROF_PATH=rocprof` on a
|
| 28 |
+
real MI300X ROCm environment to replace demo artifacts with `data_source="real_rocm"`.
|
| 29 |
+
Real run output should be captured separately with the exact ROCm version, kernel
|
| 30 |
+
input size, compiler flags, and profiler logs.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
docs/benchmark_runs/mi300x_results.txt
CHANGED
|
@@ -1,17 +1,28 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
Date: 2025-05-06
|
| 4 |
-
Compiler: hipcc --offload-arch=gfx942 -O3
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
|
|
|
| 10 |
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
| 14 |
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Data source: demo_artifact
|
| 2 |
+
Source file: backend/tools/demo_artifacts.py
|
|
|
|
|
|
|
| 3 |
|
| 4 |
+
Hardware class:
|
| 5 |
+
GPU: AMD Instinct MI300X
|
| 6 |
+
HBM: 192GB
|
| 7 |
+
Wavefront size: 64
|
| 8 |
+
Theoretical memory bandwidth: 5.3 TB/s
|
| 9 |
|
| 10 |
+
matrix_multiply:
|
| 11 |
+
Baseline HIP: 121.4 ms
|
| 12 |
+
Optimized HIP: 89.1 ms
|
| 13 |
+
Speedup: 1.36x
|
| 14 |
+
Bandwidth: 1843.7 GB/s
|
| 15 |
|
| 16 |
+
reduction:
|
| 17 |
+
Baseline HIP: 88.2 ms
|
| 18 |
+
Optimized HIP: 68.7 ms
|
| 19 |
+
Speedup: 1.28x
|
| 20 |
+
Bandwidth: 531.8 GB/s
|
| 21 |
+
|
| 22 |
+
vector_add:
|
| 23 |
+
Baseline HIP: 45.1 ms
|
| 24 |
+
Optimized HIP: 38.2 ms
|
| 25 |
+
Speedup: 1.18x
|
| 26 |
+
Bandwidth: 4821.6 GB/s
|
| 27 |
+
|
| 28 |
+
Set ROCM_AVAILABLE=true on real MI300X hardware to produce real_rocm values.
|