Spaces:
Running on A100
Running on A100
Initial Asset Harvester HF Space
Browse filesGradio app + Dockerfile for image-to-3D Gaussian splat pipeline.
Checkpoints are downloaded at runtime from nvidia/asset-harvester
via HF_TOKEN; not included in the repo.
This view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +4 -0
- .gitignore +11 -0
- Dockerfile +63 -0
- README.md +54 -10
- app.py +601 -0
- dist/asset_harvester-1.0.0-py3-none-any.whl +3 -0
- examples/VRU_pedestrians_0d7b602f2da8c364.jpeg +3 -0
- examples/VRU_pedestrians_723ce847bf6b1671.jpeg +3 -0
- examples/VRU_pedestrians_c2d728e02d4d11cc.jpeg +3 -0
- examples/automobile_00c7f5b5caa9e7d0.jpeg +3 -0
- examples/automobile_00e617a279b7f517.jpeg +3 -0
- examples/automobile_00e9ab349b437b2c.jpeg +3 -0
- examples/automobile_03271db9979f6072.jpeg +3 -0
- examples/automobile_039b7b7af4bd853b.jpeg +3 -0
- examples/automobile_044dfeb890d95741.jpeg +3 -0
- examples/automobile_04acf10a71d112a1.jpeg +3 -0
- examples/automobile_04cbe39ba786858d.jpeg +3 -0
- examples/automobile_05abef8311f6ca8c.jpeg +3 -0
- examples/automobile_0650ef1d75757b0e.jpeg +3 -0
- examples/automobile_0742aaf29c0a7090.jpeg +3 -0
- examples/automobile_07bf69847a2eae86.jpeg +3 -0
- examples/automobile_095cdc57d3186c66.jpeg +3 -0
- examples/automobile_0a5ccea0b758dd89.jpeg +3 -0
- examples/automobile_0d21d1c69e594ca7.jpeg +3 -0
- examples/automobile_0fc4baf8c34411e8.jpeg +3 -0
- examples/automobile_125e8d7a5a5ab518.jpeg +3 -0
- examples/automobile_13ee50f6c1e8e494.jpeg +3 -0
- examples/automobile_14030c6da90d58a8.jpeg +3 -0
- examples/automobile_14586bfbf8da0dd7.jpeg +3 -0
- examples/automobile_14b077380d557c2b.jpeg +3 -0
- examples/automobile_1585b4e264e88112.jpeg +3 -0
- examples/automobile_1704da3176d628b1.jpeg +3 -0
- examples/automobile_17289d1f0904c980.jpeg +3 -0
- examples/automobile_1875f1efbced2624.jpeg +3 -0
- examples/automobile_18f3d87e7b85d808.jpeg +3 -0
- examples/automobile_191dbee26f68e8ca.jpeg +3 -0
- examples/automobile_1a001d763cacdaa6.jpeg +3 -0
- examples/automobile_1b42127109e81f09.jpeg +3 -0
- examples/automobile_1bdd779f1bc8a22e.jpeg +3 -0
- examples/automobile_1ca3d3c4b08fa14a.jpeg +3 -0
- examples/automobile_1ee187f725a01351.jpeg +3 -0
- examples/automobile_2054eba237562446.jpeg +3 -0
- examples/automobile_23a0a9163760a5c1.jpeg +3 -0
- examples/automobile_2516c24ad21db02a.jpeg +3 -0
- examples/automobile_27a5512681e556e2.jpeg +3 -0
- examples/automobile_292e62704c0e9f28.jpeg +3 -0
- examples/automobile_29dfaf2cdbc2385a.jpeg +3 -0
- examples/automobile_2bf70aae9df266eb.jpeg +3 -0
- examples/automobile_2cab9afbbcc99c9e.jpeg +3 -0
- examples/automobile_30a50721545fbffe.jpeg +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.whl filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
*.jpg filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
checkpoints-cache/
|
| 2 |
+
checkpoints/
|
| 3 |
+
.client-venv/
|
| 4 |
+
.serve-venv/
|
| 5 |
+
.claude/
|
| 6 |
+
test_inputs/
|
| 7 |
+
__pycache__/
|
| 8 |
+
*.pyc
|
| 9 |
+
*.pyo
|
| 10 |
+
.DS_Store
|
| 11 |
+
.env
|
Dockerfile
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 3 |
+
|
| 4 |
+
# ── Stage 1: Base system ─────────────────────────────────────────────
|
| 5 |
+
FROM nvidia/cuda:12.3.2-cudnn9-devel-ubuntu22.04 AS base
|
| 6 |
+
|
| 7 |
+
ENV DEBIAN_FRONTEND=noninteractive
|
| 8 |
+
ENV TZ=UTC
|
| 9 |
+
ENV PIP_NO_CACHE_DIR=1
|
| 10 |
+
|
| 11 |
+
RUN apt-get update && apt-get install -y \
|
| 12 |
+
python3 python3-pip python3-dev \
|
| 13 |
+
ffmpeg git \
|
| 14 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 15 |
+
|
| 16 |
+
RUN ln -sf /usr/bin/python3 /usr/bin/python
|
| 17 |
+
|
| 18 |
+
WORKDIR /app
|
| 19 |
+
|
| 20 |
+
# ── Stage 2: Install asset_harvester wheel + Gradio runtime deps ─────
|
| 21 |
+
FROM base AS wheel
|
| 22 |
+
|
| 23 |
+
COPY dist/asset_harvester-1.0.0-py3-none-any.whl /tmp/
|
| 24 |
+
RUN pip install --no-cache-dir \
|
| 25 |
+
'/tmp/asset_harvester-1.0.0-py3-none-any.whl[multiview-diffusion,tokengs,camera-estimator]' \
|
| 26 |
+
'gradio>=5.14.0' spaces \
|
| 27 |
+
&& rm /tmp/asset_harvester-1.0.0-py3-none-any.whl
|
| 28 |
+
|
| 29 |
+
# ── Stage 3: gsplat from source (needs torch already installed) ──────
|
| 30 |
+
FROM wheel AS gsplat
|
| 31 |
+
|
| 32 |
+
ARG GSPLAT_COMMIT=b60e917c95afc449c5be33a634f1f457e116ff5e
|
| 33 |
+
ENV TORCH_CUDA_ARCH_LIST="8.0;8.6;9.0"
|
| 34 |
+
RUN pip install --no-cache-dir --no-build-isolation \
|
| 35 |
+
"git+https://github.com/nerfstudio-project/gsplat.git@${GSPLAT_COMMIT}"
|
| 36 |
+
|
| 37 |
+
# ── Stage 4: Final image ─────────────────────────────────────────────
|
| 38 |
+
FROM gsplat AS final
|
| 39 |
+
|
| 40 |
+
RUN useradd -m -u 1000 user \
|
| 41 |
+
&& mkdir -p /app/checkpoints \
|
| 42 |
+
&& chown -R 1000:1000 /app
|
| 43 |
+
|
| 44 |
+
# HF_TOKEN is a runtime-only credential: supply it with `docker run -e HF_TOKEN=...`
# (or as a Space secret, which is exposed to the running container's environment).
# It is deliberately NOT copied from a build secret into the image: whatever a RUN
# step writes to the filesystem persists in that layer, so storing the token in
# /etc/hf_env would hand it to anyone who can pull the image, and sourcing that
# file at start-up would silently override the value passed at runtime.
# /etc/hf_env is still created because the entrypoint script sources it; it only
# needs to be readable, not executable.
RUN echo "# HF_TOKEN is read from the container environment at runtime" > /etc/hf_env \
    && chmod 0644 /etc/hf_env
|
| 52 |
+
|
| 53 |
+
RUN printf '#!/bin/bash\nsource /etc/hf_env\nexec "$@"\n' > /usr/local/bin/entrypoint.sh \
|
| 54 |
+
&& chmod +x /usr/local/bin/entrypoint.sh
|
| 55 |
+
|
| 56 |
+
COPY --chown=1000:1000 app.py /app/
|
| 57 |
+
COPY --chown=1000:1000 examples /app/examples
|
| 58 |
+
|
| 59 |
+
USER user
|
| 60 |
+
EXPOSE 7860
|
| 61 |
+
|
| 62 |
+
ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
|
| 63 |
+
CMD ["python", "app.py"]
|
README.md
CHANGED
|
@@ -1,15 +1,59 @@
|
|
| 1 |
---
|
| 2 |
title: Asset Harvester
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk:
|
| 7 |
-
|
| 8 |
-
python_version: '3.12'
|
| 9 |
-
app_file: app.py
|
| 10 |
pinned: false
|
| 11 |
-
|
| 12 |
-
short_description: 'Demo of nvidia/asset-harvester models '
|
| 13 |
---
|
| 14 |
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
title: Asset Harvester
|
| 3 |
+
emoji: "\U0001F697"
|
| 4 |
+
colorFrom: green
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
|
|
|
|
|
|
| 8 |
pinned: false
|
| 9 |
+
short_description: Image-to-3D for autonomous-vehicle simulation assets
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# Asset Harvester
|
| 13 |
+
|
| 14 |
+
[**Paper**](https://arxiv.org/abs/2604.18468) | [**Project Page**](https://research.nvidia.com/labs/sil/projects/asset-harvester/) | [**Code**](https://github.com/NVIDIA/asset-harvester) | [**Model**](https://huggingface.co/nvidia/asset-harvester) | [**Data**](https://huggingface.co/datasets/nvidia/PhysicalAI-Autonomous-Vehicles-NCore)
|
| 15 |
+
|
| 16 |
+
Upload one image of a single object (vehicle, pedestrian, cyclist, or other road object) and get back a complete 3D Gaussian splat asset ready for simulation.
|
| 17 |
+
|
| 18 |
+
## Pipeline
|
| 19 |
+
|
| 20 |
+
```
|
| 21 |
+
upload ─▶ image guard (optional) ─▶ object segmentation ─▶ recenter + pad
|
| 22 |
+
│
|
| 23 |
+
▼
|
| 24 |
+
3D Gaussian splat ◀── TokenGS lifting ◀── multiview diffusion ◀── camera estimation
|
| 25 |
+
```
|
| 26 |
+
|
| 27 |
+
1. **Object segmentation** (`AH_object_seg_jit.pt`) — Mask2Former JIT produces a binary mask of the foreground object at the uploaded image's native resolution.
|
| 28 |
+
2. **Camera estimation** (`AH_camera_estimator.safetensors`) — predicts camera pose, distance, FOV, and object dimensions (LWH). Shares the C-RADIO backbone with multiview diffusion to avoid loading it twice.
|
| 29 |
+
3. **Multiview diffusion** (`AH_multiview_diffusion.safetensors`) — SparseViewDiT generates 16 novel orbit views conditioned on the input image.
|
| 30 |
+
4. **TokenGS lifting** (`AH_tokengs_lifting.safetensors`) — feed-forward 3D Gaussian reconstructor lifts the 16 views to a full 3DGS asset.
|
| 31 |
+
|
| 32 |
+
## Outputs
|
| 33 |
+
|
| 34 |
+
- Multiview MP4 (16-frame orbit at 5fps).
|
| 35 |
+
- 3D Gaussian orbit render (MP4).
|
| 36 |
+
- Gaussian splat (PLY) ready for simulation engines.
|
| 37 |
+
|
| 38 |
+
## Hardware
|
| 39 |
+
|
| 40 |
+
Single NVIDIA GPU with compute capability ≥ 8.0 and ≥ 30 GB VRAM. Typical end-to-end runtime: **1-2 minutes** per image on A100/H100.
|
| 41 |
+
|
| 42 |
+
## Limitations
|
| 43 |
+
|
| 44 |
+
- Single-object only — images with multiple distinct subjects will use the largest mask and discard the rest.
|
| 45 |
+
- Heavily occluded objects or out-of-distribution subjects (e.g., objects not seen in driving logs) may produce hallucinated geometry.
|
| 46 |
+
- Image guard uses `meta-llama/Llama-Guard-3-11B-Vision` — enabling it adds ~20-30 s per run.
|
| 47 |
+
|
| 48 |
+
## Local deployment
|
| 49 |
+
|
| 50 |
+
```bash
|
| 51 |
+
docker build -t asset-harvester .
|
| 52 |
+
docker run --gpus all -e HF_TOKEN=$HF_TOKEN -p 7860:7860 asset-harvester
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
Checkpoints are downloaded from [`nvidia/asset-harvester`](https://huggingface.co/nvidia/asset-harvester) on first run. `HF_TOKEN` must have access to that repo.
|
| 56 |
+
|
| 57 |
+
## Governing terms
|
| 58 |
+
|
| 59 |
+
Use of this system is governed by the [NVIDIA Open Model License Agreement](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license/).
|
app.py
ADDED
|
@@ -0,0 +1,601 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 3 |
+
|
| 4 |
+
"""Asset Harvester Gradio demo — single-image upload to 3D Gaussian splat."""
|
| 5 |
+
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
import gc
|
| 9 |
+
import logging
|
| 10 |
+
import os
|
| 11 |
+
import random
|
| 12 |
+
import tempfile
|
| 13 |
+
import threading
|
| 14 |
+
import uuid
|
| 15 |
+
from functools import partial
|
| 16 |
+
|
| 17 |
+
import gradio as gr
|
| 18 |
+
import imageio
|
| 19 |
+
import numpy as np
|
| 20 |
+
import torch
|
| 21 |
+
import torchvision.transforms as T
|
| 22 |
+
from diffusers.schedulers import DPMSolverMultistepScheduler
|
| 23 |
+
from huggingface_hub import snapshot_download
|
| 24 |
+
from PIL import Image
|
| 25 |
+
|
| 26 |
+
class _SpacesStub:
|
| 27 |
+
@staticmethod
|
| 28 |
+
def GPU(*args, **kwargs):
|
| 29 |
+
def decorator(fn):
|
| 30 |
+
return fn
|
| 31 |
+
|
| 32 |
+
if args and callable(args[0]):
|
| 33 |
+
return args[0]
|
| 34 |
+
return decorator
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
try:
|
| 38 |
+
import spaces
|
| 39 |
+
|
| 40 |
+
_HAS_SPACES = True
|
| 41 |
+
except ImportError:
|
| 42 |
+
_HAS_SPACES = False
|
| 43 |
+
spaces = _SpacesStub() # type: ignore[assignment]
|
| 44 |
+
|
| 45 |
+
if os.getenv("SPACE_ID") is None:
|
| 46 |
+
_HAS_SPACES = False
|
| 47 |
+
spaces = _SpacesStub() # type: ignore[assignment]
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
| 51 |
+
logger = logging.getLogger(__name__)
|
| 52 |
+
|
| 53 |
+
HF_CHECKPOINT_REPO = "nvidia/asset-harvester"
|
| 54 |
+
CHECKPOINTS_DIR = "/app/checkpoints" if os.path.isdir("/app") else os.path.join(os.getcwd(), "checkpoints")
|
| 55 |
+
|
| 56 |
+
MV_CKPT = "AH_multiview_diffusion.safetensors"
|
| 57 |
+
TOKENGS_CKPT = "AH_tokengs_lifting.safetensors"
|
| 58 |
+
AHC_CKPT = "AH_camera_estimator.safetensors"
|
| 59 |
+
SEG_CKPT = "AH_object_seg_jit.pt"
|
| 60 |
+
|
| 61 |
+
DEFAULT_NUM_STEPS = 30
|
| 62 |
+
DEFAULT_CFG_SCALE = 2.0
|
| 63 |
+
IMAGE_SIZE = 512
|
| 64 |
+
GRAY_VALUE = 128
|
| 65 |
+
SEG_INPUT_SIZE = (384, 384)
|
| 66 |
+
MIN_MASK_AREA_FRAC = 0.01
|
| 67 |
+
MAX_MASK_AREA_FRAC = 0.95
|
| 68 |
+
MIN_UPLOAD_SIDE = 256
|
| 69 |
+
|
| 70 |
+
_MODELS_LOCK = threading.Lock()
|
| 71 |
+
_MODELS: dict = {}
|
| 72 |
+
_CKPT_PATHS: dict[str, str] = {}
|
| 73 |
+
_SESSION_MVDATA: dict[str, object] = {}
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def _load_seg_estimator_class():
    """Return the `Mask2FormerSegmentationEstimator` class, loaded by file path.

    The module `ncore_parser/image_segmentation.py` is executed directly from the
    installed `asset_harvester` package directory under the private module name
    `_ah_image_segmentation`, so `asset_harvester.ncore_parser.__init__` (which
    pulls in the private `ncore` module) is never imported.
    """
    import importlib.util

    import asset_harvester

    module_file = os.path.join(
        os.path.dirname(asset_harvester.__file__),
        "ncore_parser",
        "image_segmentation.py",
    )
    module_spec = importlib.util.spec_from_file_location("_ah_image_segmentation", module_file)
    seg_module = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(seg_module)
    return seg_module.Mask2FormerSegmentationEstimator
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def _download_checkpoints() -> None:
    """Download the four checkpoints and record their local paths in `_CKPT_PATHS`.

    Files are fetched from `HF_CHECKPOINT_REPO` into `CHECKPOINTS_DIR`, using the
    `HF_TOKEN` environment variable as the access token (``None`` when unset).
    The call is a no-op once `_CKPT_PATHS` has been populated.

    Raises:
        FileNotFoundError: if any expected checkpoint file is absent from the
            download directory after `snapshot_download` returns.
    """
    if _CKPT_PATHS:
        return
    local_dir = snapshot_download(
        repo_id=HF_CHECKPOINT_REPO,
        allow_patterns=[MV_CKPT, TOKENGS_CKPT, AHC_CKPT, SEG_CKPT],
        local_dir=CHECKPOINTS_DIR,
        token=os.getenv("HF_TOKEN"),
    )
    # Resolve into a scratch dict and publish it only when every file is present.
    # Writing to `_CKPT_PATHS` entry by entry would leave it half-filled after a
    # FileNotFoundError, and the early return above would then treat that partial
    # state as "already downloaded" on the next call.
    resolved: dict[str, str] = {}
    for key, filename in (("mv", MV_CKPT), ("tokengs", TOKENGS_CKPT), ("ahc", AHC_CKPT), ("seg", SEG_CKPT)):
        path = os.path.join(local_dir, filename)
        if not os.path.isfile(path):
            raise FileNotFoundError(f"Missing {filename} in {local_dir}")
        resolved[key] = path
    _CKPT_PATHS.update(resolved)
    logger.info("Checkpoints ready in %s", local_dir)
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def _load_models(device: str) -> dict:
    """Construct all models once and return the shared `_MODELS` registry.

    The first caller builds everything while holding `_MODELS_LOCK`; later
    callers get the already-populated dict back, so `device` only takes effect
    on that first call. The registry holds the keys `pipeline`, `ahc`, `seg`,
    `lifter`, `dtype` and `device`.

    The compute dtype is bfloat16 when `device` starts with "cuda", float32
    otherwise. The C-RADIO model and image processor returned by `get_models`
    are passed to both the diffusion pipeline and the camera estimator.
    """
    with _MODELS_LOCK:
        if _MODELS:
            return _MODELS

        from asset_harvester.camera_estimator.inference import AHCEstimator
        from asset_harvester.multiview_diffusion.pipelines import SparseViewDiTPipeline
        from asset_harvester.multiview_diffusion.utils.model_builder import get_models
        from asset_harvester.tokengs.lifting_inference import TokengsLiftingRunner

        seg_estimator_cls = _load_seg_estimator_class()

        _download_checkpoints()

        on_cuda = device.startswith("cuda")
        dtype = torch.bfloat16 if on_cuda else torch.float32

        logger.info("Loading MVD (+ VAE, c-radio)...")
        vae, cradio_model, cradio_image_processor, transformer = get_models(
            _CKPT_PATHS["mv"],
            device=device,
            dtype=dtype,
        )
        flow_scheduler = DPMSolverMultistepScheduler(
            num_train_timesteps=1000,
            beta_schedule="scaled_linear",
            prediction_type="flow_prediction",
            flow_shift=1.0,
            use_flow_sigmas=True,
        )
        mv_pipeline = SparseViewDiTPipeline(
            vae=vae,
            text_encoder=None,
            tokenizer=None,
            scheduler=flow_scheduler,
            transformer=transformer,
            image_encoder=cradio_model,
            image_processor=cradio_image_processor,
        )
        mv_pipeline = mv_pipeline.to(dtype)

        logger.info("Loading AHC (shared c-radio)...")
        camera_estimator = AHCEstimator(
            checkpoint_path=_CKPT_PATHS["ahc"],
            device=device,
            cradio_model=cradio_model,
            cradio_image_processor=cradio_image_processor,
        )

        logger.info("Loading segmentation JIT...")
        segmenter = seg_estimator_cls(
            model_path=_CKPT_PATHS["seg"],
            device=device,
            input_size=SEG_INPUT_SIZE,
        )

        logger.info("Loading TokenGS lifting...")
        gs_lifter = TokengsLiftingRunner(
            _CKPT_PATHS["tokengs"],
            bbox_size=0.8,
            dtype=dtype,
            render_img_size=IMAGE_SIZE,
        )

        _MODELS.update(
            pipeline=mv_pipeline,
            ahc=camera_estimator,
            seg=segmenter,
            lifter=gs_lifter,
            dtype=dtype,
            device=device,
        )
        return _MODELS
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
def _segment(seg, image_pil: Image.Image) -> np.ndarray:
    """Return a uint8 mask (0 or 255) of the largest instance, at image resolution.

    `seg.predict` yields a dict with `classes` and bit-packed `instance_masks`.
    When no instance is reported the result is an all-zero mask. Otherwise the
    masks are unpacked to `SEG_INPUT_SIZE`, the instance with the largest pixel
    count is kept, and it is resized to the image size with nearest-neighbour
    sampling.
    """
    _, instance_seg = seg.predict(image_pil)
    num_instances = len(instance_seg["classes"])
    target_w, target_h = image_pil.width, image_pil.height
    if num_instances == 0:
        return np.zeros((target_h, target_w), dtype=np.uint8)

    mask_h, mask_w = SEG_INPUT_SIZE
    bits = np.unpackbits(instance_seg["instance_masks"])
    per_instance = bits.reshape(num_instances, mask_h, mask_w)
    pixel_counts = per_instance.sum(axis=(1, 2))
    winner = int(np.argmax(pixel_counts))
    biggest = per_instance[winner].astype(np.uint8) * 255

    mask_image = Image.fromarray(biggest, mode="L")
    mask_image = mask_image.resize((target_w, target_h), Image.NEAREST)
    return np.array(mask_image)
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def _recenter_and_pad(image_pil: Image.Image, mask_np: np.ndarray) -> tuple[Image.Image, Image.Image]:
    """Translate image+mask so the mask centroid lands at frame center, square-pad, resize.

    Image padding uses GRAY_VALUE (matches AHC's apply_mask background). Mask padding uses 0.
    Both outputs are resized to IMAGE_SIZE x IMAGE_SIZE (bilinear for the image,
    nearest-neighbour for the mask).

    Args:
        image_pil: Input image; converted to RGB before pasting.
        mask_np: 2-D mask whose shape is taken as (H, W); pixels > 0 are foreground.
            Assumed to have the same height/width as `image_pil`.

    Returns:
        (image, mask) as PIL images of size IMAGE_SIZE x IMAGE_SIZE.

    Raises:
        ValueError: on degenerate masks — empty, below MIN_MASK_AREA_FRAC, above
            MAX_MASK_AREA_FRAC, or with a bounding box touching all four edges.
    """
    H, W = mask_np.shape
    ys, xs = np.where(mask_np > 0)
    if ys.size == 0:
        raise ValueError("No object detected in the input image. Try a cleaner photo with a single subject.")

    area_frac = ys.size / (H * W)
    if area_frac < MIN_MASK_AREA_FRAC:
        raise ValueError(f"Detected object is too small ({area_frac * 100:.1f}% of image).")
    if area_frac > MAX_MASK_AREA_FRAC:
        raise ValueError(
            f"Detected object fills nearly the whole image ({area_frac * 100:.1f}%); provide a wider-angle photo."
        )

    y0, y1 = int(ys.min()), int(ys.max())
    x0, x1 = int(xs.min()), int(xs.max())
    if y0 == 0 and y1 == H - 1 and x0 == 0 and x1 == W - 1:
        raise ValueError("Object touches all four edges; provide an image showing the full object.")

    cy = float(ys.mean())
    cx = float(xs.mean())
    # The paste offset uses the centroid rounded to whole pixels.
    cy_px = int(round(cy))
    cx_px = int(round(cx))
    side_y = int(np.ceil(2 * max(cy, H - cy)))
    side_x = int(np.ceil(2 * max(cx, W - cx)))
    # `side_y`/`side_x` come from the unrounded centroid. Python rounds halves to
    # the nearest even integer, so a centroid of (odd k) + 0.5 rounds up to k + 1
    # while `side // 2` is only k. The paste offset would then be -1, the slice
    # below would be empty, and numpy would raise a broadcast error. Including
    # twice the rounded centroid keeps `side // 2 >= rounded centroid`, so the
    # offsets are never negative; in every other case `side` is unchanged.
    side = max(side_y, side_x, H, W, 2 * cy_px, 2 * cx_px)

    paste_y = side // 2 - cy_px
    paste_x = side // 2 - cx_px

    canvas_img = np.full((side, side, 3), GRAY_VALUE, dtype=np.uint8)
    canvas_msk = np.zeros((side, side), dtype=np.uint8)
    img_np = np.array(image_pil.convert("RGB"))
    canvas_img[paste_y : paste_y + H, paste_x : paste_x + W] = img_np
    canvas_msk[paste_y : paste_y + H, paste_x : paste_x + W] = mask_np

    out_img = Image.fromarray(canvas_img).resize((IMAGE_SIZE, IMAGE_SIZE), Image.BILINEAR)
    out_msk = Image.fromarray(canvas_msk).resize((IMAGE_SIZE, IMAGE_SIZE), Image.NEAREST)
    return out_img, out_msk
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
def _run_image_guard(image_pil: Image.Image, device: str, dtype: torch.dtype) -> None:
    """Run the image safety check and raise `gr.Error` if the image is rejected.

    A fresh `ImageGuard` is loaded for this call only. It is always unloaded
    afterwards, followed by a garbage-collection pass and, when CUDA is
    available, a CUDA cache flush. This cleanup runs even if loading or checking
    raises.
    """
    from asset_harvester.utils.image_guard import ImageGuard

    checker = ImageGuard(device=device, dtype=dtype)
    try:
        checker.load()
        verdict = checker.check_image(image_pil)
    finally:
        checker.unload()
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    if verdict.passed:
        return
    raise gr.Error(f"Image rejected by safety check (label={verdict.label}, score={verdict.score:.2f}).")
|
| 245 |
+
|
| 246 |
+
|
| 247 |
+
def _build_mvdata(image_pil: Image.Image, mask_pil: Image.Image, ahc):
    """Estimate camera parameters for one image/mask pair and wrap them in `MVData`.

    The image and mask are written to a temporary directory because `ahc.run`
    is called with a list of (frame path, mask path) tuples. The directory is
    removed as soon as the estimator output has been copied into numpy arrays,
    so uploaded images do not pile up on disk.

    Args:
        image_pil: Recentred RGB image.
        mask_pil: Matching foreground mask.
        ahc: Camera estimator exposing `run(list_of_path_pairs)`, whose result is
            indexed by "cam_poses", "dists", "fov" and "lwh".

    Returns:
        An `MVData` with a single frame/mask. `clip_id` and `obj_id` are the
        placeholders "upload" and "0", and `npct` is hard-coded to "vehicle".
    """
    from asset_harvester.multiview_diffusion.data.nre_preproc import MVData

    with tempfile.TemporaryDirectory(prefix="ah_upload_") as tmp:
        frame_p = os.path.join(tmp, "frame_0.jpg")
        mask_p = os.path.join(tmp, "mask_0.png")
        image_pil.save(frame_p, quality=95)
        mask_pil.save(mask_p)

        cam_data = ahc.run([(frame_p, mask_p)])
        # Materialise everything needed while the files still exist.
        cam_poses = np.array(cam_data["cam_poses"], dtype=np.float32)
        dists = np.array(cam_data["dists"], dtype=np.float32)
        fov = np.array(cam_data["fov"], dtype=np.float32)
        lwh = np.array(cam_data["lwh"], dtype=np.float32)

    return MVData(
        clip_id="upload",
        obj_id="0",
        frames=[np.array(image_pil)],
        cam_poses=cam_poses,
        dists=dists,
        fov=fov,
        npct="vehicle",
        lwh=lwh,
        masks=[np.array(mask_pil)],
        auto_label=None,
    )
|
| 269 |
+
|
| 270 |
+
|
| 271 |
+
def _encode_mp4(frames_np, path: str, fps: int = 24) -> None:
    """Write the frame sequence `frames_np` to `path` with imageio's v2 `mimwrite`.

    `macro_block_size=1` is forwarded to the writer.
    NOTE(review): presumably this stops the encoder resizing frames to a
    multiple of its block size — confirm against the imageio-ffmpeg docs.
    """
    writer_options = {"fps": fps, "macro_block_size": 1}
    imageio.v2.mimwrite(path, frames_np, **writer_options)
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
@spaces.GPU(duration=60)
def run_segmentation(image_pil, is_example: bool = False, progress=gr.Progress()):
    """First stage: safety check + segmentation + recentering + camera estimation.

    Returns (mask_preview, state) where state is handed to `run_3d`.
    Progress shown only on the mask image output.

    Args:
        image_pil: Uploaded PIL image, or None when nothing was uploaded.
        is_example: When True the safety check is skipped (curated example).
        progress: Gradio progress tracker.

    Raises:
        gr.Error: if no image was given, the image is smaller than
            MIN_UPLOAD_SIDE on either side, the safety check rejects it, or the
            segmentation mask is degenerate.
    """
    # Upper bound on results waiting for `run_3d`. Entries are only removed when
    # the second stage consumes them, so without a cap every abandoned upload
    # would stay in memory for the life of the process.
    max_pending_sessions = 32

    if image_pil is None:
        raise gr.Error("Please upload an image.")
    if min(image_pil.size) < MIN_UPLOAD_SIDE:
        raise gr.Error(f"Image too small ({image_pil.size[0]}x{image_pil.size[1]}); min {MIN_UPLOAD_SIDE}px per side.")

    device = "cuda" if torch.cuda.is_available() else "cpu"

    progress(0.1, desc="Loading models…")
    models = _load_models(device)
    dtype = models["dtype"]

    image_pil = image_pil.convert("RGB")

    if is_example:
        progress(0.3, desc="Skipping safety check (curated example)…")
    else:
        progress(0.3, desc="Running safety check…")
        _run_image_guard(image_pil, device, dtype)

    progress(0.6, desc="Segmenting object…")
    mask_np = _segment(models["seg"], image_pil)

    progress(0.8, desc="Recentering and estimating camera…")
    try:
        centered_img, centered_mask = _recenter_and_pad(image_pil, mask_np)
    except ValueError as e:
        # Chain the cause so the original traceback stays visible in the logs.
        raise gr.Error(str(e)) from e

    # Preview: original pixels where the mask is set, flat gray elsewhere.
    rgb = np.array(image_pil)
    fg = (mask_np > 0).astype(np.uint8)[:, :, None]
    mask_preview = Image.fromarray(np.where(fg, rgb, np.full_like(rgb, GRAY_VALUE)).astype(np.uint8))

    mvdata = _build_mvdata(centered_img, centered_mask, models["ahc"])

    # Evict the oldest pending entries (dicts keep insertion order) to make room.
    # `list(...)` snapshots the keys and `pop(..., None)` tolerates an entry that
    # `run_3d` consumed in the meantime.
    overflow = len(_SESSION_MVDATA) - max_pending_sessions + 1
    for stale_uid in list(_SESSION_MVDATA)[: max(overflow, 0)]:
        _SESSION_MVDATA.pop(stale_uid, None)

    uid = str(uuid.uuid4())
    _SESSION_MVDATA[uid] = mvdata

    progress(1.0, desc="Done")
    return mask_preview, uid
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
@spaces.GPU(duration=180)
def run_3d(state, progress=gr.Progress()):
    """Second stage: multiview diffusion + TokenGS lifting.

    Returns (orbit_mp4_path, ply_path) matching outputs=[video_out, ply_out].

    Args:
        state: Session id returned by `run_segmentation`; its stored `MVData` is
            consumed (removed from `_SESSION_MVDATA`) by this call.
        progress: Gradio progress tracker.

    Raises:
        gr.Error: if `state` is empty, unknown, or was already consumed.
    """
    if not state or state not in _SESSION_MVDATA:
        raise gr.Error("Segmentation must run first.")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    models = _load_models(device)
    pipeline = models["pipeline"]
    lifter = models["lifter"]
    # Model loading sits between the check above and this pop, so a duplicate
    # request for the same state can get here after the entry is gone. Popping
    # with a default turns that into the same user-facing error, not a KeyError.
    mvdata = _SESSION_MVDATA.pop(state, None)
    if mvdata is None:
        raise gr.Error("Segmentation must run first.")

    from asset_harvester.multiview_diffusion.data.inference_utils import build_eval_cams
    from asset_harvester.multiview_diffusion.data.nre_preproc import preproc

    progress(0.05, desc="Preparing multiview conditioning…")
    transform = T.Compose(
        [T.Resize(IMAGE_SIZE), T.ToTensor(), T.Normalize([0.5], [0.5])]
    )
    inference_preproc = partial(
        preproc,
        image_transform=transform,
        resolution=IMAGE_SIZE,
        conditioning_mode="n",
        eval_mode=True,
        eval_cam_sampler=build_eval_cams,
    )
    data_dict = inference_preproc(mvdata)

    # Keep the target views plus at most four of the remaining entries, and trim
    # every per-view attribute to that length.
    max_length = data_dict.n_target + min(4, len(data_dict.x) - data_dict.n_target)
    for attr in ("x", "c2w_relatives", "x_white_background", "dists", "fovs", "plucker_image", "relative_brightness"):
        if hasattr(data_dict, attr):
            setattr(data_dict, attr, getattr(data_dict, attr)[:max_length])
    if hasattr(data_dict, "intrinsics") and data_dict.intrinsics.shape[0] > max_length:
        data_dict.intrinsics = data_dict.intrinsics[:max_length]

    progress(0.15, desc="Generating multiview images…")
    with torch.no_grad():
        output = pipeline(
            data_dict=data_dict,
            num_inference_steps=DEFAULT_NUM_STEPS,
            guidance_scale=DEFAULT_CFG_SCALE,
            flow_shift=1.0,
            output_type="pil",
        )
    images_np = [np.array(img) for img in output["images"]]

    progress(0.55, desc="Lifting to 3D Gaussian splat…")
    output_dir = tempfile.mkdtemp(prefix="ah_out_")
    offload_ok = False
    try:
        # Move the diffusion models to the CPU while lifting runs; the `finally`
        # block moves them back.
        if torch.cuda.is_available():
            for name in ("vae", "transformer", "image_encoder"):
                m = getattr(pipeline, name, None)
                if m is not None:
                    m.to("cpu")
            pipeline.to("cpu")
            offload_ok = True
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        fov = float(data_dict.fovs[0].item())
        dist = float(data_dict.dists[0].item())
        lwh = data_dict.lwh if hasattr(data_dict, "lwh") and data_dict.lwh is not None else [1.0, 1.0, 1.0]
        with torch.no_grad():
            gaussians = lifter.run_lifting(images_np, fov, dist, lwh)

        progress(0.85, desc="Rendering orbit views of the lifted splat…")
        with torch.no_grad():
            rendered = lifter.render_orbit_views(gaussians, fov, dist, lwh)
        # Rendered frames are permuted (1, 2, 0) before encoding.
        rendered_np = [im.permute(1, 2, 0).cpu().numpy() for im in rendered]
        orbit_mp4 = os.path.join(output_dir, "lifting.mp4")
        _encode_mp4(rendered_np, orbit_mp4)

        progress(0.95, desc="Saving Gaussian splat…")
        ply_path = os.path.join(output_dir, "gaussians.ply")
        lifter.save_ply(gaussians, ply_path)
    finally:
        # Restore the diffusion models for the next request, even after a failure.
        if offload_ok and torch.cuda.is_available():
            for name in ("vae", "transformer", "image_encoder"):
                m = getattr(pipeline, name, None)
                if m is not None:
                    m.to(device)
            pipeline.to(device)
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    progress(1.0, desc="Done")
    return orbit_mp4, ply_path
|
| 417 |
+
|
| 418 |
+
|
| 419 |
+
HEADER_MD = """
|
| 420 |
+
## Image to 3D Asset with [Asset Harvester](https://github.com/NVIDIA/asset-harvester)
|
| 421 |
+
|
| 422 |
+
[**Paper**](https://arxiv.org/abs/2604.18468) | [**Project Page**](https://research.nvidia.com/labs/sil/projects/asset-harvester/) | [**Code**](https://github.com/NVIDIA/asset-harvester) | [**Model**](https://huggingface.co/nvidia/asset-harvester) | [**Data**](https://huggingface.co/datasets/nvidia/PhysicalAI-Autonomous-Vehicles-NCore)
|
| 423 |
+
|
| 424 |
+
**Upload a single image of one object — a vehicle, pedestrian, cyclist, or other road object — to generate a 3D Gaussian splat asset. The assumed inputs are images cropped and rectified from AV datasets, like the example images below. However, you can also challenge the model with internet photos.**
|
| 425 |
+
|
| 426 |
+
The inference pipeline consists of:
|
| 427 |
+
|
| 428 |
+
- **Object Segmentation** — isolates the object from the background.
|
| 429 |
+
- **Camera Estimation** — predicts the viewing direction, distance, field of view, and object dimensions.
|
| 430 |
+
- **Multiview Diffusion** — generates 16 novel orbit views.
|
| 431 |
+
- **3D Lifting** — reconstructs the generated views into a 3D Gaussian splat (downloadable PLY).
|
| 432 |
+
"""
|
| 433 |
+
|
| 434 |
+
|
| 435 |
+
def build_ui():
    """Assemble the Gradio Blocks interface for the Asset Harvester demo.

    The page is a Markdown header followed by a single row of three columns:
    an example gallery, the image prompt with usage notes and the run button,
    and the outputs (segmentation preview, orbit video, PLY download).

    Clicking the run button calls ``run_segmentation`` and, only if that
    completes without raising, ``run_3d`` on the state it produced.

    Returns:
        tuple: ``(demo, theme)`` — the ``gr.Blocks`` app with its queue
        configured, and the ``gr.themes.Default`` instance created here.
        The theme is not applied inside this function; the caller passes it on.
    """
    theme = gr.themes.Default(primary_hue="green", neutral_hue="slate")
    app_css = """
    /* Base typography */
    .gradio-container { font-size: 20px !important; }
    .gradio-container .prose p, .gradio-container .prose li,
    .gradio-container .md p, .gradio-container .md li { font-size: 1.2rem !important; line-height: 1.6 !important; }
    .gradio-container .prose h2, .gradio-container .md h2 { font-size: 2rem !important; }
    .gradio-container .block-label, .gradio-container button { font-size: 1.1rem !important; }

    /* Fluid media — images/videos fill their column, keep aspect ratio */
    .gradio-container .image-container img,
    .gradio-container .video-container video { max-width: 100% !important; max-height: 100% !important;
    width: auto !important; height: auto !important;
    object-fit: contain !important; }
    .gradio-container .image-container, .gradio-container .video-container
    { display: flex !important; align-items: center !important; justify-content: center !important; }

    /* Narrow viewports: let columns wrap instead of cramming */
    @media (max-width: 1024px) {
        .gradio-container .prose h2, .gradio-container .md h2 { font-size: 1.7rem !important; }
        .gradio-container .prose p, .gradio-container .prose li,
        .gradio-container .md p, .gradio-container .md li { font-size: 1.1rem !important; }
    }
    @media (max-width: 720px) {
        .gradio-container { font-size: 18px !important; }
        /* Force columns in the main Row to take full width, stack vertically */
        .gradio-container .grid-wrap { grid-template-columns: 1fr !important; }
    }
    """
    with gr.Blocks(title="Asset Harvester", css=app_css) as demo:
        gr.Markdown(HEADER_MD)

        # Created with render=False so the Dataset below can reference it
        # before it is placed in the middle column via image_in.render().
        image_in = gr.Image(
            label="Image Prompt",
            type="pil",
            height=360,
            sources=["upload", "clipboard"],
            render=False,
        )

        # One single-element sample per image file found next to this module.
        examples_dir = os.path.join(os.path.dirname(__file__), "examples")
        all_examples = [
            [os.path.join(examples_dir, f)]
            for f in sorted(os.listdir(examples_dir))
            if f.lower().endswith((".jpeg", ".jpg", ".png"))
        ]

        # NOTE(review): column nesting below is inferred from widget order;
        # confirm it matches the intended layout.
        with gr.Row():
            with gr.Column(scale=2, min_width=200):
                examples_ds = gr.Dataset(
                    components=[image_in],
                    samples=all_examples,
                    samples_per_page=18,
                    label="Example images",
                )
            with gr.Column(scale=4, min_width=360):
                image_in.render()
                gr.Markdown(
                    "**Notes:**\n\n"
                    "* **For best results, please upload clear, object-centric images "
                    "where the camera is level with the object, similar to rectified "
                    "ego-viewpoint images in our AV setting.**\n"
                    "* The uploaded images are screened with "
                    "[Llama Guard 3 Vision](https://huggingface.co/meta-llama/Llama-Guard-3-11B-Vision) "
                    "to filter out harmful content."
                )
                run_btn = gr.Button("Generate 3D Asset", variant="primary")
                gr.Markdown(
                    "<p style='font-size: 1rem; margin: 0.5rem 0;'>"
                    "<b>Disclaimer:</b> Asset Harvester is trained for the AV domain, "
                    "and its performance is not guaranteed on arbitrary images."
                    "</p>"
                )

            with gr.Column(scale=5, min_width=400):
                mask_out = gr.Image(
                    label="Object Segmentation",
                    type="pil",
                    height=400,
                )
                video_out = gr.Video(
                    label="3D Gaussian Splat — Orbit Render",
                    height=400,
                    autoplay=True,
                    loop=True,
                )
                ply_out = gr.DownloadButton(
                    label="Download PLY",
                )

        # stage_state carries run_segmentation's second output into run_3d;
        # is_example records whether the current image came from the gallery.
        stage_state = gr.State()
        is_example = gr.State(False)

        def _pick_example(sample):
            """Return the first element of a list/tuple sample, else the sample itself."""
            return sample[0] if isinstance(sample, (list, tuple)) else sample

        # Gallery click: load the image, then mark it as an example.
        examples_ds.click(
            _pick_example, inputs=examples_ds, outputs=image_in
        ).then(lambda: True, outputs=is_example)

        # Any user-provided image (or clearing the widget) resets the flag.
        image_in.input(lambda: False, outputs=is_example)
        image_in.clear(lambda: False, outputs=is_example)

        def _shuffled_examples():
            """Return a Dataset update holding a shuffled copy of all examples."""
            shuffled = all_examples.copy()
            random.shuffle(shuffled)
            return gr.update(samples=shuffled)

        # Re-shuffle the gallery on every page load.
        demo.load(_shuffled_examples, inputs=None, outputs=examples_ds)

        run_btn.click(
            fn=run_segmentation,
            inputs=[image_in, is_example],
            outputs=[mask_out, stage_state],
            show_progress="full",
            show_progress_on=[mask_out],
            concurrency_id="seg",
            concurrency_limit=2,
        ).success(
            # .success (not .then): start the 3D stage only when segmentation
            # finished without raising. .then also fires after a failure and
            # would run run_3d on a missing or stale stage_state.
            fn=run_3d,
            inputs=[stage_state],
            outputs=[video_out, ply_out],
            show_progress="full",
            concurrency_id="gpu3d",
            concurrency_limit=1,
        )
    demo.queue(default_concurrency_limit=1, max_size=30)
    return demo, theme
|
| 564 |
+
|
| 565 |
+
|
| 566 |
+
def _prefetch_all(device: str) -> None:
    """Download checkpoints and bring the main models into memory at startup.

    Steps, in order: ``_download_checkpoints()``, ``_load_models(device)``, then
    a best-effort ``snapshot_download`` of the Llama Guard 3 Vision repository.

    The image-guard weights are only placed in the disk cache here — the model
    is loaded/unloaded per call because it is large (~22 GB on GPU). A failure
    while prefetching them is logged as a warning and does not stop startup.

    Args:
        device: Device string forwarded to ``_load_models``.
    """
    logger.info("Prefetching asset-harvester checkpoints...")
    _download_checkpoints()

    logger.info("Loading pipeline / AHC / segmentation / TokenGS into memory...")
    _load_models(device)

    logger.info("Prefetching Llama Guard 3 Vision weights to disk cache...")
    guard_repo = "meta-llama/Llama-Guard-3-11B-Vision"
    guard_patterns = ["*.json", "*.safetensors", "*.txt", "*.model", "tokenizer*"]
    try:
        snapshot_download(
            repo_id=guard_repo,
            allow_patterns=guard_patterns,
            token=os.getenv("HF_TOKEN"),
        )
    except Exception as exc:
        # Best effort only: the weights can still be fetched later.
        logger.warning(
            "Could not prefetch Llama Guard weights (will download on first safety check): %s", exc,
        )
    else:
        logger.info("Image guard weights cached.")

    logger.info("Startup prefetch complete.")
|
| 592 |
+
|
| 593 |
+
|
| 594 |
+
# Warm-up runs at import time unless AH_PREFETCH is set to something other
# than "1" (the default when the variable is unset).
if os.getenv("AH_PREFETCH", "1") == "1":
    if torch.cuda.is_available():
        _startup_device = "cuda"
    else:
        _startup_device = "cpu"
    _prefetch_all(_startup_device)

# Built at import time so `demo` is available as a module-level object.
demo, _theme = build_ui()


if __name__ == "__main__":
    # Listen on all interfaces, port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860, max_threads=40, theme=_theme)
|
dist/asset_harvester-1.0.0-py3-none-any.whl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:60004f835c196d9e9a3e3cf017aede4c6b3c189c0d2bf6578a449c15f85bc3f0
|
| 3 |
+
size 198831
|
examples/VRU_pedestrians_0d7b602f2da8c364.jpeg
ADDED
|
Git LFS Details
|
examples/VRU_pedestrians_723ce847bf6b1671.jpeg
ADDED
|
Git LFS Details
|
examples/VRU_pedestrians_c2d728e02d4d11cc.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_00c7f5b5caa9e7d0.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_00e617a279b7f517.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_00e9ab349b437b2c.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_03271db9979f6072.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_039b7b7af4bd853b.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_044dfeb890d95741.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_04acf10a71d112a1.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_04cbe39ba786858d.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_05abef8311f6ca8c.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_0650ef1d75757b0e.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_0742aaf29c0a7090.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_07bf69847a2eae86.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_095cdc57d3186c66.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_0a5ccea0b758dd89.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_0d21d1c69e594ca7.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_0fc4baf8c34411e8.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_125e8d7a5a5ab518.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_13ee50f6c1e8e494.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_14030c6da90d58a8.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_14586bfbf8da0dd7.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_14b077380d557c2b.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_1585b4e264e88112.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_1704da3176d628b1.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_17289d1f0904c980.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_1875f1efbced2624.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_18f3d87e7b85d808.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_191dbee26f68e8ca.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_1a001d763cacdaa6.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_1b42127109e81f09.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_1bdd779f1bc8a22e.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_1ca3d3c4b08fa14a.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_1ee187f725a01351.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_2054eba237562446.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_23a0a9163760a5c1.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_2516c24ad21db02a.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_27a5512681e556e2.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_292e62704c0e9f28.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_29dfaf2cdbc2385a.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_2bf70aae9df266eb.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_2cab9afbbcc99c9e.jpeg
ADDED
|
Git LFS Details
|
examples/automobile_30a50721545fbffe.jpeg
ADDED
|
Git LFS Details
|