Spaces:

AIDC-AI
/

Ovis-U1-3B

Running on Zero

App Files Files Community

multimodalart HF Staff committed 17 days ago

Commit

37094bb

verified ·

1 Parent(s): 9a6780b

[Admin maintenance] Support new ZeroGPU hardware

Browse files

Thank you so much for sharing this Space with the community. We have upgraded the ZeroGPU infrastructure to run on the modern Blackwell architecture.
Because of that, your demo needs to be updated to support the new hardware, and this PR makes the changes required for it to work. As this is something we broke on our end, we may merge this PR autonomously. If it breaks unexpectedly or brings unintended consequences, feel free to revert or modify it. If you run into any issues, you can email apolinario@huggingface.co

Files changed (7) hide show

app.py +0 -2
flash_attn/__init__.py +24 -0
flash_attn/flash_attn_interface.py +112 -0
flash_attn/funcs.py +54 -0
flash_attn/layers/__init__.py +0 -0
flash_attn/layers/rotary.py +51 -0
requirements.txt +6 -5

app.py CHANGED Viewed

@@ -1,6 +1,4 @@
 import os
-import subprocess
-subprocess.run('pip install flash-attn==2.6.3 --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 import random
 import spaces
 import numpy as np

 import os
 import random
 import spaces
 import numpy as np

flash_attn/__init__.py ADDED Viewed

	@@ -0,0 +1,24 @@

+"""Minimal torch-native shim for flash_attn used by AIDC-AI/Ovis-U1-3B.
+The upstream modeling file imports:
+    from flash_attn.layers.rotary import apply_rotary_emb
+    from flash_attn import flash_attn_varlen_func
+Blackwell/CUDA-13 has no flash-attn prebuilt wheel for cp310+torch>=2.10, and the
+package's CUDA build doesn't fit within the @spaces.GPU 1500s budget, so we
+provide a small torch-native equivalent that satisfies the two call sites the
+model actually exercises.
+We also fake a version string within the range xformers tolerates so that
+``xformers/ops/fmha/flash.py`` (loaded transitively by ``diffusers``) does not
+explode at import time. The xformers FA backend it then registers will never
+be invoked along the user-facing demo path (the model uses transformers SDPA
+attention + this shim's varlen path; diffusers' xformers backend is only
+engaged via an explicit ``set_use_memory_efficient_attention_xformers`` opt-in
+which the demo never makes).
+"""
+__version__ = "2.8.3"
+from .funcs import flash_attn_varlen_func  # noqa: F401
+from . import flash_attn_interface  # noqa: F401  -- expose submodule eagerly

flash_attn/flash_attn_interface.py ADDED Viewed

	@@ -0,0 +1,112 @@

+"""Stub of flash_attn.flash_attn_interface.
+Re-exports the torch-native ``flash_attn_varlen_func`` from the shim plus a
+``flash_attn_func`` fallback that uses ``torch.nn.functional.scaled_dot_product_attention``
+for the padded (batch, seqlen, nheads, headdim) call signature.
+Also exposes a placeholder ``flash_attn_gpu`` object so xformers'
+``hasattr(flash_attn.flash_attn_interface, "flash_attn_gpu")`` probe in
+``xformers/ops/fmha/flash.py`` succeeds. The backend xformers registers from
+this probe is never invoked along the demo's user-facing path.
+"""
+import torch
+import torch.nn.functional as F
+from .funcs import flash_attn_varlen_func  # noqa: F401
+class _UnavailableBackend:
+    """Placeholder standing in for the compiled flash-attn extension module.
+    Looking up any ordinary attribute raises ``RuntimeError`` with an
+    explanatory message. Dunder lookups raise ``AttributeError`` instead, so
+    protocol probes made through ``hasattr`` / ``getattr(obj, name, default)``
+    (``copy.deepcopy`` probes ``__deepcopy__`` this way) get a regular
+    "attribute missing" answer rather than an unexpected crash.
+    """
+    def __getattr__(self, name):
+        # __getattr__ only runs after normal lookup has failed, so methods
+        # inherited from ``object`` never reach this point.
+        if name.startswith("__") and name.endswith("__"):
+            raise AttributeError(name)
+        raise RuntimeError(
+            "flash_attn shim: real CUDA backend is not installed. "
+            "The demo's user-facing path should not need it."
+        )
+# Module-level placeholders; both names are bound to inert stand-ins.
+flash_attn_gpu = _UnavailableBackend()
+flash_attn_cuda = _UnavailableBackend()
+def flash_attn_func(
+    q,
+    k,
+    v,
+    dropout_p=0.0,
+    softmax_scale=None,
+    causal=False,
+    window_size=(-1, -1),
+    softcap=0.0,
+    alibi_slopes=None,
+    deterministic=False,
+    return_attn_probs=False,
+):
+    """Padded attention computed with ``F.scaled_dot_product_attention``.
+    Args:
+        q: queries, shape (B, Lq, H, D).
+        k, v: keys/values, shape (B, Lk, H_kv, D). ``H_kv`` may be smaller
+            than ``H`` (MQA/GQA) as long as it divides ``H``; each K/V head is
+            then shared by ``H // H_kv`` consecutive query heads.
+        dropout_p: dropout probability forwarded to SDPA.
+        softmax_scale: scaling of the attention scores; defaults to
+            ``D ** -0.5``.
+        causal: apply a causal mask. When ``Lq != Lk`` the mask is aligned to
+            the bottom-right corner of the score matrix (the flash-attn >= 2.1
+            convention); query rows that can see no key produce zeros.
+        window_size, softcap, alibi_slopes, deterministic, return_attn_probs:
+            accepted for signature compatibility only and ignored.
+    Returns:
+        Tensor of shape (B, Lq, H, D).
+    Raises:
+        ValueError: if ``H`` is not a multiple of ``H_kv``.
+    """
+    if softmax_scale is None:
+        softmax_scale = q.shape[-1] ** -0.5
+    # SDPA expects (B, H, L, D)
+    q_t = q.transpose(1, 2)
+    k_t = k.transpose(1, 2)
+    v_t = v.transpose(1, 2)
+    heads_q, heads_kv = q_t.shape[1], k_t.shape[1]
+    if heads_kv != heads_q:
+        if heads_kv == 0 or heads_q % heads_kv:
+            raise ValueError(
+                f"number of query heads ({heads_q}) must be a multiple of "
+                f"the number of key/value heads ({heads_kv})"
+            )
+        # Expand K/V so that query head h reads K/V head h // group.
+        group = heads_q // heads_kv
+        k_t = k_t.repeat_interleave(group, dim=1)
+        v_t = v_t.repeat_interleave(group, dim=1)
+    len_q, len_k = q_t.shape[-2], k_t.shape[-2]
+    if causal and len_q != len_k:
+        # SDPA's is_causal aligns the mask top-left; build the bottom-right
+        # aligned mask explicitly (True = key is visible to the query).
+        visible = torch.ones(
+            len_q, len_k, dtype=torch.bool, device=q.device
+        ).tril(diagonal=len_k - len_q)
+        out = F.scaled_dot_product_attention(
+            q_t, k_t, v_t,
+            attn_mask=visible,
+            dropout_p=dropout_p,
+            scale=softmax_scale,
+        )
+        if len_q > len_k:
+            # Leading query rows see no key at all; force them to zero
+            # instead of whatever the softmax over an empty set produced.
+            out = out.masked_fill(~visible.any(dim=-1, keepdim=True), 0.0)
+    else:
+        out = F.scaled_dot_product_attention(
+            q_t, k_t, v_t,
+            dropout_p=dropout_p,
+            is_causal=causal,
+            scale=softmax_scale,
+        )
+    return out.transpose(1, 2)
+def flash_attn_qkvpacked_func(
+    qkv,
+    dropout_p=0.0,
+    softmax_scale=None,
+    causal=False,
+    window_size=(-1, -1),
+    softcap=0.0,
+    alibi_slopes=None,
+    deterministic=False,
+    return_attn_probs=False,
+):
+    """Attention over a packed QKV tensor of shape (B, L, 3, H, D).
+    The tensor is split along dim 2 into q, k and v, which are handed to
+    ``flash_attn_func`` together with every keyword option unchanged.
+    """
+    query, key, value = qkv.unbind(dim=2)
+    # Forward the options verbatim; flash_attn_func decides which it honours.
+    options = dict(
+        dropout_p=dropout_p,
+        softmax_scale=softmax_scale,
+        causal=causal,
+        window_size=window_size,
+        softcap=softcap,
+        alibi_slopes=alibi_slopes,
+        deterministic=deterministic,
+        return_attn_probs=return_attn_probs,
+    )
+    return flash_attn_func(query, key, value, **options)
+def flash_attn_kvpacked_func(
+    q,
+    kv,
+    dropout_p=0.0,
+    softmax_scale=None,
+    causal=False,
+    window_size=(-1, -1),
+    softcap=0.0,
+    alibi_slopes=None,
+    deterministic=False,
+    return_attn_probs=False,
+):
+    """Attention with separate queries and a packed key/value tensor.
+    ``q`` has shape (B, Lq, H, D) and ``kv`` has shape (B, Lk, 2, H, D); ``kv``
+    is split along dim 2 into k and v before delegating to
+    ``flash_attn_func`` with every keyword option unchanged.
+    """
+    key, value = kv.unbind(dim=2)
+    # Forward the options verbatim; flash_attn_func decides which it honours.
+    options = dict(
+        dropout_p=dropout_p,
+        softmax_scale=softmax_scale,
+        causal=causal,
+        window_size=window_size,
+        softcap=softcap,
+        alibi_slopes=alibi_slopes,
+        deterministic=deterministic,
+        return_attn_probs=return_attn_probs,
+    )
+    return flash_attn_func(q, key, value, **options)

flash_attn/funcs.py ADDED Viewed

	@@ -0,0 +1,54 @@

+"""Torch-native equivalent of flash_attn.flash_attn_varlen_func.
+Only the forward path used by AIMv2's vision tower is implemented:
+    flash_attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k)
+q, k, v are packed (total_tokens, num_heads, head_dim) tensors and the function
+returns the same packed (total_tokens, num_heads, head_dim) shape after applying
+self-attention per sub-sequence as encoded by `cu_seqlens_q == cu_seqlens_k`.
+"""
+import torch
+import torch.nn.functional as F
+def flash_attn_varlen_func(
+    q,
+    k,
+    v,
+    cu_seqlens_q,
+    cu_seqlens_k,
+    max_seqlen_q,
+    max_seqlen_k,
+    dropout_p=0.0,
+    softmax_scale=None,
+    causal=False,
+    window_size=(-1, -1),
+    softcap=0.0,
+    alibi_slopes=None,
+    deterministic=False,
+    return_attn_probs=False,
+):
+    """Variable-length attention over packed sequences, built on SDPA.
+    Args:
+        q: packed queries, shape (total_q, H, D).
+        k, v: packed keys/values, shape (total_k, H_kv, D). ``H_kv`` may be
+            smaller than ``H`` (MQA/GQA) as long as it divides ``H``.
+        cu_seqlens_q, cu_seqlens_k: 1-D tensors of cumulative sequence
+            boundaries; sequence ``i`` spans ``[cu[i], cu[i + 1])``. They are
+            copied to the host with ``.tolist()``.
+        dropout_p: dropout probability forwarded to SDPA.
+        softmax_scale: scaling of the attention scores; defaults to
+            ``D ** -0.5``.
+        causal: apply a causal mask per sequence. When a sequence has
+            different query and key lengths the mask is aligned to the
+            bottom-right corner (the flash-attn >= 2.1 convention); query rows
+            that can see no key produce zeros.
+        max_seqlen_q, max_seqlen_k, window_size, softcap, alibi_slopes,
+        deterministic, return_attn_probs: accepted for signature
+            compatibility only and ignored.
+    Returns:
+        Packed tensor of shape (total_q, H, D); empty along dim 0 when there
+        are no sequences.
+    Raises:
+        ValueError: if ``H`` is not a multiple of ``H_kv``, or if
+            ``cu_seqlens_k`` describes fewer sequences than ``cu_seqlens_q``.
+    """
+    if softmax_scale is None:
+        softmax_scale = q.shape[-1] ** -0.5
+    heads_q, heads_kv = q.shape[1], k.shape[1]
+    if heads_kv != heads_q:
+        if heads_kv == 0 or heads_q % heads_kv:
+            raise ValueError(
+                f"number of query heads ({heads_q}) must be a multiple of "
+                f"the number of key/value heads ({heads_kv})"
+            )
+        # Expand K/V so that query head h reads K/V head h // group.
+        group = heads_q // heads_kv
+        k = k.repeat_interleave(group, dim=1)
+        v = v.repeat_interleave(group, dim=1)
+    cu_q = cu_seqlens_q.tolist()
+    cu_k = cu_seqlens_k.tolist()
+    if len(cu_k) < len(cu_q):
+        raise ValueError(
+            f"cu_seqlens_k has {len(cu_k)} entries but cu_seqlens_q has "
+            f"{len(cu_q)}; every query sequence needs key boundaries"
+        )
+    out_chunks = []
+    for sq, eq, sk, ek in zip(cu_q, cu_q[1:], cu_k, cu_k[1:]):
+        len_q, len_k = eq - sq, ek - sk
+        if len_q == 0 or len_k == 0:
+            # No queries, or no keys to attend to: emit zeros of the right
+            # shape rather than running a softmax over an empty axis.
+            out_chunks.append(q.new_zeros((len_q, heads_q, v.shape[-1])))
+            continue
+        q_i = q[sq:eq].transpose(0, 1).unsqueeze(0)  # (1, H, Lq, D)
+        k_i = k[sk:ek].transpose(0, 1).unsqueeze(0)
+        v_i = v[sk:ek].transpose(0, 1).unsqueeze(0)
+        if causal and len_q != len_k:
+            # SDPA's is_causal aligns the mask top-left; build the
+            # bottom-right aligned mask explicitly (True = key visible).
+            visible = torch.ones(
+                len_q, len_k, dtype=torch.bool, device=q.device
+            ).tril(diagonal=len_k - len_q)
+            o_i = F.scaled_dot_product_attention(
+                q_i, k_i, v_i,
+                attn_mask=visible,
+                dropout_p=dropout_p,
+                scale=softmax_scale,
+            )
+            if len_q > len_k:
+                # Leading query rows see no key at all; force them to zero.
+                o_i = o_i.masked_fill(~visible.any(dim=-1, keepdim=True), 0.0)
+        else:
+            o_i = F.scaled_dot_product_attention(
+                q_i, k_i, v_i,
+                dropout_p=dropout_p,
+                is_causal=causal,
+                scale=softmax_scale,
+            )
+        out_chunks.append(o_i.squeeze(0).transpose(0, 1))  # (Lq, H, D)
+    if not out_chunks:
+        # torch.cat rejects an empty list; return an empty packed tensor.
+        return q.new_empty((0, heads_q, v.shape[-1]))
+    return torch.cat(out_chunks, dim=0)

flash_attn/layers/__init__.py ADDED Viewed

File without changes

flash_attn/layers/rotary.py ADDED Viewed

	@@ -0,0 +1,51 @@

+"""Torch-native equivalent of flash_attn.layers.rotary.apply_rotary_emb.
+Mirrors the flash_attn `apply_rotary_emb_torch` reference implementation:
+    x: (batch_size, seqlen, nheads, headdim)
+    cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)
+"""
+import torch
+def _rotate_half(x, interleaved=False):
+    """Return the rotary "rotated half" of ``x`` along its last dimension.
+    With ``interleaved=False`` the last dimension is split into a first and a
+    second half and the result is ``cat(-second, first)``. With
+    ``interleaved=True`` even/odd positions are paired and every pair
+    ``(a, b)`` becomes ``(-b, a)`` in place.
+    """
+    if interleaved:
+        evens = x[..., 0::2]
+        odds = x[..., 1::2]
+        # Stack as (..., pairs, 2) then merge the pair axis back in.
+        return torch.stack((-odds, evens), dim=-1).flatten(-2)
+    first, second = x.chunk(2, dim=-1)
+    return torch.cat((-second, first), dim=-1)
+def apply_rotary_emb(
+    x,
+    cos,
+    sin,
+    interleaved=False,
+    inplace=False,
+    seqlen_offsets=0,
+    cu_seqlens=None,
+    max_seqlen=None,
+):
+    """Pure-torch rotary embedding application.
+    Args:
+        x: tensor whose last dimension is the head dimension; the padded
+            layout is (batch_size, seqlen, nheads, headdim).
+        cos, sin: rotary tables with last dimension ``rotary_dim / 2``. For a
+            4-D ``x`` and 2-D tables of shape (seqlen_ro, rotary_dim / 2) with
+            ``seqlen_ro >= seqlen_offsets + seqlen``, the rows
+            ``[seqlen_offsets, seqlen_offsets + seqlen)`` are used.
+        interleaved: rotate (even, odd) pairs instead of (first half, second
+            half) pairs.
+        seqlen_offsets: integer position of the first token of ``x`` in the
+            tables. Non-integer values are ignored.
+        inplace, cu_seqlens, max_seqlen: accepted for signature compatibility
+            only and ignored; a new tensor is always returned.
+    Returns:
+        Tensor shaped like ``x`` with the first ``rotary_dim`` features
+        rotated and the remaining features passed through.
+    Raises:
+        ValueError: if the rotary dimension exceeds the head dimension.
+    """
+    ro_dim = cos.shape[-1] * 2
+    if ro_dim > x.shape[-1]:
+        # Explicit raise: an assert would be stripped under ``python -O``.
+        raise ValueError(f"rotary dim {ro_dim} exceeds head dim {x.shape[-1]}")
+    if cu_seqlens is None and x.dim() == 4 and cos.dim() == 2:
+        seqlen = x.shape[1]
+        start = seqlen_offsets if isinstance(seqlen_offsets, int) else 0
+        if start >= 0 and cos.shape[0] >= start + seqlen:
+            # Keep only the rows for the positions x occupies. Without this a
+            # longer table would broadcast against the sequence axis.
+            cos = cos[start:start + seqlen]
+            sin = sin[start:start + seqlen]
+    # Broadcast cos/sin from (..., rotary_dim/2) up to (..., 1, rotary_dim)
+    if interleaved:
+        cos = cos.unsqueeze(-2).repeat_interleave(2, dim=-1)
+        sin = sin.unsqueeze(-2).repeat_interleave(2, dim=-1)
+    else:
+        cos = cos.unsqueeze(-2)
+        sin = sin.unsqueeze(-2)
+        cos = torch.cat([cos, cos], dim=-1)
+        sin = torch.cat([sin, sin], dim=-1)
+    x_rot = x[..., :ro_dim]
+    x_pass = x[..., ro_dim:]
+    out_rot = x_rot * cos + _rotate_half(x_rot, interleaved) * sin
+    return torch.cat([out_rot, x_pass], dim=-1)

requirements.txt CHANGED Viewed

@@ -1,17 +1,18 @@
-torch==2.4.0
 transformers==4.51.3
 tokenizers==0.21.1
 sentencepiece==0.1.99
 pyarrow==18.0.0
 accelerate==1.1.0
-pydantic==2.8.2
 markdown2[all]
-numpy==1.24.3
 scikit-learn==1.2.2
 requests
 httpx
 uvicorn
-fastapi==0.112.4
 einops==0.6.1
 einops-exts==0.0.4
 timm==1.0.11
@@ -19,7 +20,7 @@ tiktoken
 transformers_stream_generator==0.0.4
 scipy
 pandas
-torchaudio
 xformers
 pillow==10.3.0
 pysubs2==1.7.2

+torch==2.10.0
+torchvision==0.25.0
 transformers==4.51.3
 tokenizers==0.21.1
 sentencepiece==0.1.99
 pyarrow==18.0.0
 accelerate==1.1.0
+pydantic
 markdown2[all]
+numpy<2
 scikit-learn==1.2.2
 requests
 httpx
 uvicorn
+fastapi
 einops==0.6.1
 einops-exts==0.0.4
 timm==1.0.11
 transformers_stream_generator==0.0.4
 scipy
 pandas
+torchaudio==2.10.0
 xformers
 pillow==10.3.0
 pysubs2==1.7.2