Spaces:

AIDC-AI
/

Ovis-U1-3B

Running on Zero

App Files Files Community

[Admin maintenance] Support new ZeroGPU hardware

by multimodalart HF Staff - opened 17 days ago

base: refs/heads/main

←

from: refs/pr/4

Discussion Files changed

+247

-7

Files changed (7) hide show

app.py +0 -2
flash_attn/__init__.py +24 -0
flash_attn/flash_attn_interface.py +112 -0
flash_attn/funcs.py +54 -0
flash_attn/layers/__init__.py +0 -0
flash_attn/layers/rotary.py +51 -0
requirements.txt +6 -5

app.py CHANGED Viewed

@@ -1,6 +1,4 @@
 import os
-import subprocess
-subprocess.run('pip install flash-attn==2.6.3 --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 import random
 import spaces
 import numpy as np

 import os
 import random
 import spaces
 import numpy as np

flash_attn/__init__.py ADDED Viewed

	@@ -0,0 +1,24 @@

+"""Minimal torch-native shim for flash_attn used by AIDC-AI/Ovis-U1-3B.
+The upstream modeling file imports:
+    from flash_attn.layers.rotary import apply_rotary_emb
+    from flash_attn import flash_attn_varlen_func
+Blackwell/CUDA-13 has no flash-attn prebuilt wheel for cp310+torch>=2.10, and the
+package's CUDA build doesn't fit within the @spaces.GPU 1500s budget, so we
+provide a small torch-native equivalent that satisfies the two call sites the
+model actually exercises.
+We also fake a version string within the range xformers tolerates so that
+``xformers/ops/fmha/flash.py`` (loaded transitively by ``diffusers``) does not
+explode at import time. The xformers FA backend it then registers will never
+be invoked along the user-facing demo path (the model uses transformers SDPA
+attention + this shim's varlen path; diffusers' xformers backend is only
+engaged via an explicit ``set_use_memory_efficient_attention_xformers`` opt-in
+which the demo never makes).
+"""
+__version__ = "2.8.3"
+from .funcs import flash_attn_varlen_func  # noqa: F401
+from . import flash_attn_interface  # noqa: F401  -- expose submodule eagerly

flash_attn/flash_attn_interface.py ADDED Viewed

	@@ -0,0 +1,112 @@

+"""Stub of flash_attn.flash_attn_interface.
+Re-exports the torch-native ``flash_attn_varlen_func`` from the shim plus a
+``flash_attn_func`` fallback that uses ``torch.nn.functional.scaled_dot_product_attention``
+for the padded (batch, seqlen, nheads, headdim) call signature.
+Also exposes a placeholder ``flash_attn_gpu`` object so xformers'
+``hasattr(flash_attn.flash_attn_interface, "flash_attn_gpu")`` probe in
+``xformers/ops/fmha/flash.py`` succeeds. The backend xformers registers from
+this probe is never invoked along the demo's user-facing path.
+"""
+import torch
+import torch.nn.functional as F
+from .funcs import flash_attn_varlen_func  # noqa: F401
+class _UnavailableBackend:
+    """Opaque placeholder for the flash-attn CUDA extension that is not installed.
+    Reading any regular attribute raises ``RuntimeError`` with a clear message.
+    Dunder lookups raise ``AttributeError`` instead, so ``hasattr``, ``copy``
+    and ``inspect`` style probes keep working on the placeholder.
+    """
+    def __getattr__(self, name):
+        # __getattr__ only runs after normal lookup fails, so real attributes are unaffected.
+        if name.startswith("__") and name.endswith("__"):
+            # hasattr()/getattr(obj, name, default) only swallow AttributeError.
+            raise AttributeError(name)
+        raise RuntimeError(
+            "flash_attn shim: real CUDA backend is not installed. "
+            "The demo's user-facing path should not need it."
+        )
+flash_attn_gpu = _UnavailableBackend()
+flash_attn_cuda = _UnavailableBackend()
+def _sdpa_with_flash_semantics(q_t, k_t, v_t, dropout_p, softmax_scale, causal):
+    """Run SDPA on (B, H, L, D) tensors using flash-attn's GQA and causal conventions."""
+    heads_q, heads_kv = q_t.shape[1], k_t.shape[1]
+    if heads_q != heads_kv:
+        if heads_kv == 0 or heads_q % heads_kv != 0:
+            raise ValueError(
+                f"flash_attn shim: query heads ({heads_q}) must be a multiple of "
+                f"key/value heads ({heads_kv})"
+            )
+        # Query head i reads KV head i // group; repeat_interleave yields that layout.
+        group = heads_q // heads_kv
+        k_t = k_t.repeat_interleave(group, dim=1)
+        v_t = v_t.repeat_interleave(group, dim=1)
+    len_q, len_k = q_t.shape[-2], k_t.shape[-2]
+    if not causal or len_q == len_k:
+        return F.scaled_dot_product_attention(
+            q_t, k_t, v_t,
+            dropout_p=dropout_p,
+            is_causal=causal,
+            scale=softmax_scale,
+        )
+    # flash-attn anchors the causal mask bottom-right; SDPA's is_causal anchors it top-left.
+    mask = torch.ones(len_q, len_k, dtype=torch.bool, device=q_t.device)
+    mask = mask.tril(diagonal=len_k - len_q)
+    out = F.scaled_dot_product_attention(
+        q_t, k_t, v_t,
+        attn_mask=mask,
+        dropout_p=dropout_p,
+        scale=softmax_scale,
+    )
+    # Rows that see no key (only when len_q > len_k) are zero in flash-attn, NaN in softmax.
+    return out.masked_fill(~mask.any(dim=-1, keepdim=True), 0.0)
+def flash_attn_func(
+    q,
+    k,
+    v,
+    dropout_p=0.0,
+    softmax_scale=None,
+    causal=False,
+    window_size=(-1, -1),
+    softcap=0.0,
+    alibi_slopes=None,
+    deterministic=False,
+    return_attn_probs=False,
+):
+    """Padded attention. q/k/v shape: (B, L, H, D). Returns (B, Lq, Hq, D).
+    k/v may carry fewer heads than q (GQA/MQA) as long as the query head count
+    is a multiple of the key/value head count. ``deterministic`` is accepted
+    for signature compatibility and has no effect.
+    Raises NotImplementedError when ``window_size``, ``softcap``,
+    ``alibi_slopes`` or ``return_attn_probs`` is set to a non-default value,
+    because this fallback cannot honour them and ignoring them would silently
+    change the result.
+    """
+    has_window = window_size is not None and tuple(window_size) != (-1, -1)
+    if has_window or softcap != 0.0 or alibi_slopes is not None or return_attn_probs:
+        raise NotImplementedError(
+            "flash_attn shim: window_size, softcap, alibi_slopes and "
+            "return_attn_probs are not supported by the torch-native fallback."
+        )
+    if softmax_scale is None:
+        softmax_scale = q.shape[-1] ** -0.5
+    # SDPA expects (B, H, L, D)
+    out = _sdpa_with_flash_semantics(
+        q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2),
+        dropout_p, softmax_scale, causal,
+    )
+    return out.transpose(1, 2)
+def flash_attn_qkvpacked_func(
+    qkv,
+    dropout_p=0.0,
+    softmax_scale=None,
+    causal=False,
+    window_size=(-1, -1),
+    softcap=0.0,
+    alibi_slopes=None,
+    deterministic=False,
+    return_attn_probs=False,
+):
+    """Attention over a packed QKV tensor of shape (B, L, 3, H, D).
+    Splits ``qkv`` along dim 2 into query, key and value, then forwards them
+    and every option unchanged to ``flash_attn_func``.
+    """
+    query, key, value = qkv.unbind(dim=2)
+    forwarded = dict(
+        dropout_p=dropout_p,
+        softmax_scale=softmax_scale,
+        causal=causal,
+        window_size=window_size,
+        softcap=softcap,
+        alibi_slopes=alibi_slopes,
+        deterministic=deterministic,
+        return_attn_probs=return_attn_probs,
+    )
+    return flash_attn_func(query, key, value, **forwarded)
+def flash_attn_kvpacked_func(
+    q,
+    kv,
+    dropout_p=0.0,
+    softmax_scale=None,
+    causal=False,
+    window_size=(-1, -1),
+    softcap=0.0,
+    alibi_slopes=None,
+    deterministic=False,
+    return_attn_probs=False,
+):
+    """Attention with packed key/value. q: (B, Lq, H, D), kv: (B, Lk, 2, H, D).
+    Splits ``kv`` along dim 2 into key and value, then forwards them with ``q``
+    and every option unchanged to ``flash_attn_func``.
+    """
+    key, value = kv.unbind(dim=2)
+    forwarded = dict(
+        dropout_p=dropout_p,
+        softmax_scale=softmax_scale,
+        causal=causal,
+        window_size=window_size,
+        softcap=softcap,
+        alibi_slopes=alibi_slopes,
+        deterministic=deterministic,
+        return_attn_probs=return_attn_probs,
+    )
+    return flash_attn_func(q, key, value, **forwarded)

flash_attn/funcs.py ADDED Viewed

	@@ -0,0 +1,54 @@

+"""Torch-native equivalent of flash_attn.flash_attn_varlen_func.
+Only the forward path used by AIMv2's vision tower is implemented:
+    flash_attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k)
+q, k, v are packed (total_tokens, num_heads, head_dim) tensors and the function
+returns the same packed (total_tokens, num_heads, head_dim) shape after applying
+self-attention per sub-sequence as encoded by `cu_seqlens_q == cu_seqlens_k`.
+"""
+import torch
+import torch.nn.functional as F
+def _attend_packed_segment(q_i, k_i, v_i, dropout_p, softmax_scale, causal):
+    """Attend one sub-sequence. Inputs are (L, H, D); returns (Lq, Hq, Dv)."""
+    len_q, len_k = q_i.shape[0], k_i.shape[0]
+    if len_q == 0 or len_k == 0:
+        # No query or no key: flash-attn yields zeros, a softmax over no keys yields NaN.
+        return q_i.new_zeros((len_q, q_i.shape[1], v_i.shape[-1]))
+    heads_q, heads_kv = q_i.shape[1], k_i.shape[1]
+    if heads_q != heads_kv:
+        if heads_kv == 0 or heads_q % heads_kv != 0:
+            raise ValueError(
+                f"flash_attn shim: query heads ({heads_q}) must be a multiple of "
+                f"key/value heads ({heads_kv})"
+            )
+        # Query head i reads KV head i // group; repeat_interleave yields that layout.
+        group = heads_q // heads_kv
+        k_i = k_i.repeat_interleave(group, dim=1)
+        v_i = v_i.repeat_interleave(group, dim=1)
+    q_t = q_i.transpose(0, 1).unsqueeze(0)  # (1, H, Lq, D)
+    k_t = k_i.transpose(0, 1).unsqueeze(0)
+    v_t = v_i.transpose(0, 1).unsqueeze(0)
+    mask = None
+    if causal and len_q != len_k:
+        # flash-attn anchors the causal mask bottom-right; SDPA's is_causal anchors it top-left.
+        mask = torch.ones(len_q, len_k, dtype=torch.bool, device=q_i.device)
+        mask = mask.tril(diagonal=len_k - len_q)
+    o_t = F.scaled_dot_product_attention(
+        q_t, k_t, v_t,
+        attn_mask=mask,
+        dropout_p=dropout_p,
+        is_causal=causal and mask is None,
+        scale=softmax_scale,
+    )
+    if mask is not None:
+        # Rows that see no key (only when len_q > len_k) must be zero rather than NaN.
+        o_t = o_t.masked_fill(~mask.any(dim=-1, keepdim=True), 0.0)
+    return o_t.squeeze(0).transpose(0, 1)  # (Lq, H, D)
+def flash_attn_varlen_func(
+    q,
+    k,
+    v,
+    cu_seqlens_q,
+    cu_seqlens_k,
+    max_seqlen_q,
+    max_seqlen_k,
+    dropout_p=0.0,
+    softmax_scale=None,
+    causal=False,
+    window_size=(-1, -1),
+    softcap=0.0,
+    alibi_slopes=None,
+    deterministic=False,
+    return_attn_probs=False,
+):
+    """Variable-length attention over packed (total_tokens, num_heads, head_dim) tensors.
+    ``cu_seqlens_q`` / ``cu_seqlens_k`` hold the cumulative sequence boundaries;
+    segment ``i`` of ``q`` attends only to segment ``i`` of ``k``/``v``. The
+    result is packed in the same order as ``q``. ``max_seqlen_q``,
+    ``max_seqlen_k`` and ``deterministic`` are accepted for signature
+    compatibility and are not used.
+    Raises NotImplementedError when ``window_size``, ``softcap``,
+    ``alibi_slopes`` or ``return_attn_probs`` is set to a non-default value,
+    and ValueError when the two boundary tensors describe a different number
+    of sequences.
+    """
+    has_window = window_size is not None and tuple(window_size) != (-1, -1)
+    if has_window or softcap != 0.0 or alibi_slopes is not None or return_attn_probs:
+        raise NotImplementedError(
+            "flash_attn shim: window_size, softcap, alibi_slopes and "
+            "return_attn_probs are not supported by the torch-native fallback."
+        )
+    if softmax_scale is None:
+        softmax_scale = q.shape[-1] ** -0.5
+    cu_q = cu_seqlens_q.tolist()
+    cu_k = cu_seqlens_k.tolist()
+    if len(cu_q) != len(cu_k):
+        raise ValueError(
+            "flash_attn shim: cu_seqlens_q and cu_seqlens_k must have the same length, "
+            f"got {len(cu_q)} and {len(cu_k)}"
+        )
+    bounds_q = zip(cu_q, cu_q[1:])
+    bounds_k = zip(cu_k, cu_k[1:])
+    out_chunks = [
+        _attend_packed_segment(q[sq:eq], k[sk:ek], v[sk:ek], dropout_p, softmax_scale, causal)
+        for (sq, eq), (sk, ek) in zip(bounds_q, bounds_k)
+    ]
+    if not out_chunks:
+        # torch.cat rejects an empty list; an empty batch maps to an empty packed output.
+        return q.new_zeros((0, q.shape[1], v.shape[-1]))
+    return torch.cat(out_chunks, dim=0)

flash_attn/layers/__init__.py ADDED Viewed

File without changes

flash_attn/layers/rotary.py ADDED Viewed

	@@ -0,0 +1,51 @@

+"""Torch-native equivalent of flash_attn.layers.rotary.apply_rotary_emb.
+Mirrors the flash_attn `apply_rotary_emb_torch` reference implementation:
+    x: (batch_size, seqlen, nheads, headdim)
+    cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)
+"""
+import torch
+def _rotate_half(x, interleaved=False):
+    """Return ``x`` with each (x1, x2) pair of the last dim mapped to (-x2, x1).
+    Pairs are (first half, second half) by default and (even index, odd index)
+    when ``interleaved`` is true.
+    """
+    if not interleaved:
+        x1, x2 = x.chunk(2, dim=-1)
+        return torch.cat((-x2, x1), dim=-1)
+    x1, x2 = x[..., ::2], x[..., 1::2]
+    out = torch.stack((-x2, x1), dim=-1)
+    return out.flatten(-2)
+def apply_rotary_emb(
+    x,
+    cos,
+    sin,
+    interleaved=False,
+    inplace=False,
+    seqlen_offsets=0,
+    cu_seqlens=None,
+    max_seqlen=None,
+):
+    """Pure-torch rotary embedding application.
+    x: (batch_size, seqlen, nheads, headdim); cos/sin: (seqlen_ro, rotary_dim / 2)
+    or (batch_size, seqlen_ro, rotary_dim / 2). Only the first ``rotary_dim``
+    features of ``x`` are rotated; the rest pass through unchanged.
+    An integer ``seqlen_offsets`` selects rows ``[offset, offset + seqlen)`` of
+    the cos/sin tables, which may therefore be longer than ``seqlen``. With
+    ``inplace=True`` the result is written into ``x`` and ``x`` is returned.
+    ``max_seqlen`` is accepted for signature compatibility and is not used.
+    Raises NotImplementedError for ``cu_seqlens`` and tensor-valued
+    ``seqlen_offsets``, and ValueError when the tables do not cover the
+    requested positions.
+    """
+    if cu_seqlens is not None:
+        raise NotImplementedError(
+            "flash_attn shim: apply_rotary_emb does not support cu_seqlens"
+        )
+    if torch.is_tensor(seqlen_offsets):
+        raise NotImplementedError(
+            "flash_attn shim: apply_rotary_emb does not support per-batch seqlen_offsets"
+        )
+    ro_dim = cos.shape[-1] * 2
+    assert ro_dim <= x.shape[-1], f"rotary dim {ro_dim} exceeds head dim {x.shape[-1]}"
+    start = int(seqlen_offsets)
+    if x.dim() >= 3 and cos.dim() >= 2:
+        seqlen = x.shape[-3]
+        table_len = cos.shape[-2]
+        # Leave the tables untouched in the plain case so existing broadcasting still applies.
+        if start != 0 or table_len > seqlen:
+            if start < 0 or start + seqlen > table_len:
+                raise ValueError(
+                    f"rotary table of length {table_len} does not cover positions "
+                    f"[{start}, {start + seqlen})"
+                )
+            cos = cos[..., start:start + seqlen, :]
+            sin = sin[..., start:start + seqlen, :]
+    elif start != 0:
+        raise ValueError("seqlen_offsets requires cos/sin with a sequence dimension")
+    # Broadcast cos/sin from (..., rotary_dim/2) up to (..., 1, rotary_dim)
+    if interleaved:
+        cos = cos.unsqueeze(-2).repeat_interleave(2, dim=-1)
+        sin = sin.unsqueeze(-2).repeat_interleave(2, dim=-1)
+    else:
+        cos = cos.unsqueeze(-2)
+        sin = sin.unsqueeze(-2)
+        cos = torch.cat([cos, cos], dim=-1)
+        sin = torch.cat([sin, sin], dim=-1)
+    x_rot = x[..., :ro_dim]
+    out_rot = x_rot * cos + _rotate_half(x_rot, interleaved) * sin
+    if inplace:
+        # out_rot is a fresh tensor, so writing it back cannot alias its own inputs.
+        x[..., :ro_dim] = out_rot
+        return x
+    x_pass = x[..., ro_dim:]
+    return torch.cat([out_rot, x_pass], dim=-1)

requirements.txt CHANGED Viewed

@@ -1,17 +1,18 @@
-torch==2.4.0
 transformers==4.51.3
 tokenizers==0.21.1
 sentencepiece==0.1.99
 pyarrow==18.0.0
 accelerate==1.1.0
-pydantic==2.8.2
 markdown2[all]
-numpy==1.24.3
 scikit-learn==1.2.2
 requests
 httpx
 uvicorn
-fastapi==0.112.4
 einops==0.6.1
 einops-exts==0.0.4
 timm==1.0.11
@@ -19,7 +20,7 @@ tiktoken
 transformers_stream_generator==0.0.4
 scipy
 pandas
-torchaudio
 xformers
 pillow==10.3.0
 pysubs2==1.7.2

+torch==2.10.0
+torchvision==0.25.0
 transformers==4.51.3
 tokenizers==0.21.1
 sentencepiece==0.1.99
 pyarrow==18.0.0
 accelerate==1.1.0
+pydantic
 markdown2[all]
+numpy<2
 scikit-learn==1.2.2
 requests
 httpx
 uvicorn
+fastapi
 einops==0.6.1
 einops-exts==0.0.4
 timm==1.0.11
 transformers_stream_generator==0.0.4
 scipy
 pandas
+torchaudio==2.10.0
 xformers
 pillow==10.3.0
 pysubs2==1.7.2