Spaces:

BlinkDL
/

RWKV-Gradio-1

Running on T4

App Files Files Community

NeverlandPeter committed 29 days ago

Commit

45d682f

1 Parent(s): c954ce5

faster

Browse files

Files changed (5) hide show

app.py +47 -38
cuda/rwkv7_fast_ops_fp16.cu +5 -4
cuda/rwkv7_v3a_ops.cpp +21 -18
cuda/rwkv7_v3a_ops.cu +291 -47
rwkv7_fast_v3a.py +184 -16

app.py CHANGED Viewed

@@ -1,45 +1,49 @@
-import os, copy
-os.environ["RWKV_V7_ON"] = '1'
-os.environ["RWKV_JIT_ON"] = '1'
-os.environ["RWKV_CUDA_ON"] = '1' # if '1' then use CUDA kernel for seq mode (much faster)
-from rwkv.model import RWKV
-import gc, re
 import gradio as gr
-import base64
-from io import BytesIO
 import torch
-import torch.nn.functional as F
 from datetime import datetime
 from huggingface_hub import hf_hub_download
 from pynvml import *
 nvmlInit()
-gpu_h = nvmlDeviceGetHandleByIndex(0)
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 ctx_limit = 7000
 gen_limit = 1000
 ########################## text rwkv ################################################################
-from rwkv.utils import PIPELINE, PIPELINE_ARGS
 title = "rwkv7-g1f-2.9b-20260420-ctx8192"
 model_path = hf_hub_download(repo_id="BlinkDL/rwkv7-g1", filename=f"{title}.pth")
-model = RWKV(model=model_path.replace('.pth',''), strategy='cuda fp16')
 pipeline = PIPELINE(model, "rwkv_vocab_v20230424")
-args = model.args
-_, _ = model.forward([0], None)
-state = model.generate_zero_state()
-static_input = torch.empty((model.n_embd), device="cuda", dtype=torch.half)
-static_state_in = [torch.empty_like(x, device="cuda") for x in state]
-static_state_out = [torch.empty_like(x, device="cuda") for x in state]
-static_output = torch.empty((model.args.vocab_size), device="cuda", dtype=torch.half)
-graph = torch.cuda.CUDAGraph()
-with torch.cuda.graph(graph):
-    static_output, static_state_out = model.forward_one_alt(static_input, static_state_in)
 def generate_prompt(instruction, input=""):
     instruction = instruction.strip().replace('\r\n','\n').replace('\n\n','\n')
@@ -73,25 +77,30 @@ def evaluate(
     out_last = 0
     out_str = ''
     occurrence = {}
-    state = None
     for i in range(int(token_count)):
         if i == 0:
             input_ids = pipeline.encode(ctx)[-ctx_limit:]
-            out, state = model.forward(input_ids, state)
-            for j in range(len(state)):
-                static_state_in[j].copy_(state[j])
-            static_output.copy_(out)
         else:
-            static_input.copy_(model.z['emb.weight'][token])
-            graph.replay()
-            for j in range(len(state)):
-                static_state_in[j].copy_(static_state_out[j])
         for n in occurrence:
-            static_output[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)
-        token = pipeline.sample_logits(static_output, temperature=args.temperature, top_p=args.top_p)
         if token in args.token_stop:
             break
         all_tokens += [token]
@@ -168,4 +177,4 @@ with gr.Blocks(title=title, theme=gr.themes.Base()) as demo:
         data.click(lambda x: x, [data], [prompt, token_count, temperature, top_p, presence_penalty, count_penalty, penalty_decay])
 demo.queue(default_concurrency_limit=1, max_size=10)
-demo.launch(share=False)

+import gc, os, re
 import gradio as gr
 import torch
 from datetime import datetime
 from huggingface_hub import hf_hub_download
 from pynvml import *
+from rwkv.utils import PIPELINE, PIPELINE_ARGS
+import rwkv7_fast_v3a as v3a
 nvmlInit()
+# gpu_h = nvmlDeviceGetHandleByIndex(0)
 ctx_limit = 7000
 gen_limit = 1000
 ########################## text rwkv ################################################################
 title = "rwkv7-g1f-2.9b-20260420-ctx8192"
 model_path = hf_hub_download(repo_id="BlinkDL/rwkv7-g1", filename=f"{title}.pth")
+# model_path = "/dev/shm/rwkv7-g1f-7.2b-20260414-ctx8192.pth"
+v3a.MODEL_PATH = model_path
+v3a.WKV_MODE = "fp32io16"
+v3a.EMB_DEVICE = "cpu"
+v3a.RKV_MODE = "off"
+v3a.CMIX_SPARSE = "no-fc"
+v3a.LOWRANK_WEIGHT = "transpose"
+v3a.ORIG_LINEAR_GROUPS = {"att_c2c", "ffn_key", "head"}
+v3a.load_extensions(v3a.WKV_MODE)
+model = v3a.RWKV7()
 pipeline = PIPELINE(model, "rwkv_vocab_v20230424")
+decode_state = model.zero_state(1)  # state buffers bound into the captured graph; evaluate() copies the prefill state into them before decoding
+decode_x = torch.empty((1, 1, v3a.C), device="cuda", dtype=torch.half)  # static fp16 input buffer of shape [1, 1, C]; uninitialized here, filled with a token embedding before each replay
+decode_path = v3a.select_path(1, 1)
+for _ in range(2):  # two eager warm-up calls before capture; they run on the uninitialized decode_x, so decode_state is only meaningful after evaluate() overwrites it
+    model.forward_from_x(decode_x, decode_state, decode_path)
+torch.cuda.synchronize()  # make sure the warm-up work has finished before graph capture starts
+decode_graph = torch.cuda.CUDAGraph()
+with torch.cuda.graph(decode_graph):  # record one single-token decode step; decode_output is the tensor read after decode_graph.replay()
+    decode_output = model.forward_from_x(decode_x, decode_state, decode_path)  # NOTE(review): presumably updates decode_state in place, since no state is copied back after replay — confirm in rwkv7_fast_v3a
+def token_to_x(token: int):  # Embed one token id with model.embed; the decode loop copies the result into decode_x.
+    token_tensor = torch.tensor([[int(token)]], dtype=torch.long, device="cpu" if model.emb_cpu else "cuda")  # id wrapped as a [1, 1] long tensor, placed on CPU when model.emb_cpu is truthy, otherwise on CUDA
+    return model.embed(token_tensor)
 def generate_prompt(instruction, input=""):
     instruction = instruction.strip().replace('\r\n','\n').replace('\n\n','\n')
     out_last = 0
     out_str = ''
     occurrence = {}
+    state = model.zero_state(1)
+    out = None
     for i in range(int(token_count)):
         if i == 0:
             input_ids = pipeline.encode(ctx)[-ctx_limit:]
+            CHUNK_LEN = 8192 # chunk prefill, save VRAM
+            while len(input_ids) > 0:
+                token_device = "cpu" if model.emb_cpu else "cuda"
+                tokens = torch.tensor(input_ids[:CHUNK_LEN], dtype=torch.long, device=token_device)
+                out = model.forward(tokens, state).view(-1)
+                input_ids = input_ids[CHUNK_LEN:]
+            for dst, src in zip(decode_state, state):
+                dst.copy_(src)
+            logits = out
         else:
+            decode_x.copy_(token_to_x(token))
+            decode_graph.replay()
+            logits = decode_output.view(-1)
         for n in occurrence:
+            logits[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)
+        token = pipeline.sample_logits(logits, temperature=args.temperature, top_p=args.top_p)
         if token in args.token_stop:
             break
         all_tokens += [token]
         data.click(lambda x: x, [data], [prompt, token_count, temperature, top_p, presence_penalty, count_penalty, penalty_decay])
 demo.queue(default_concurrency_limit=1, max_size=10)
+demo.launch(share=False, server_name="0.0.0.0")

cuda/rwkv7_fast_ops_fp16.cu CHANGED Viewed

@@ -12,7 +12,8 @@ namespace {
 constexpr int HEAD_SIZE = 64;
 constexpr int WARPS_PER_BLOCK = 4;
-constexpr float NORM_EPS = 1.0e-12f;
 constexpr int FFN_SPMV_THREADS = 128;
 constexpr int FFN_TILE = 128;
@@ -202,7 +203,7 @@ __global__ void tmix_kk_a_gate_kernel(
   float sum_sq = u0 * u0 + u1 * u1;
   sum_sq = warp_sum(sum_sq);
   const float total = __shfl_sync(0xffffffffu, sum_sq, 0);
-  const float inv_d = 1.0f / fmaxf(sqrtf(total), NORM_EPS);
   const float kk0 = u0 * inv_d;
   const float kk1 = u1 * inv_d;
@@ -264,7 +265,7 @@ __global__ void tmix_lnx_rkvres_xg_kernel(
   }
   __syncthreads();
   const float var = (partial[0] + partial[1]) * (1.0f / 64.0f);
-  const float rstd = rsqrtf(var + 64.0e-5f);
   __syncthreads();
   const float rv = load_h1(r + idx);
@@ -296,7 +297,7 @@ __global__ void tmix_vres_gate_kernel(
   if (idx >= total) {
     return;
   }
-  const int c = static_cast<int>(idx & (static_cast<int64_t>(C) - 1));
   const float vv = load_h1(v + idx);
   const float gate = sigmoid_fast(load_h1(v0 + c) + load_h1(v12 + idx));
   store_h1(out + idx, fmaf(load_h1(v_first + idx) - vv, gate, vv));

 constexpr int HEAD_SIZE = 64;
 constexpr int WARPS_PER_BLOCK = 4;
+constexpr float KK_NORMALIZE_EPS = 1.0e-12f;
+constexpr float TMIX_LN_X_EPS = 64.0e-5f;
 constexpr int FFN_SPMV_THREADS = 128;
 constexpr int FFN_TILE = 128;
   float sum_sq = u0 * u0 + u1 * u1;
   sum_sq = warp_sum(sum_sq);
   const float total = __shfl_sync(0xffffffffu, sum_sq, 0);
+  const float inv_d = 1.0f / fmaxf(sqrtf(total), KK_NORMALIZE_EPS);
   const float kk0 = u0 * inv_d;
   const float kk1 = u1 * inv_d;
   }
   __syncthreads();
   const float var = (partial[0] + partial[1]) * (1.0f / 64.0f);
+  const float rstd = rsqrtf(var + TMIX_LN_X_EPS);
   __syncthreads();
   const float rv = load_h1(r + idx);
   if (idx >= total) {
     return;
   }
+  const int c = static_cast<int>(idx % static_cast<int64_t>(C));
   const float vv = load_h1(v + idx);
   const float gate = sigmoid_fast(load_h1(v0 + c) + load_h1(v12 + idx));
   store_h1(out + idx, fmaf(load_h1(v_first + idx) - vv, gate, vv));

cuda/rwkv7_v3a_ops.cpp CHANGED Viewed

@@ -1,6 +1,8 @@
 #include <torch/extension.h>
 #include <vector>
 torch::Tensor layer_norm_f16_cuda(torch::Tensor x, torch::Tensor weight, torch::Tensor bias, double eps);
 torch::Tensor emb_ln0_bf16_to_f16_cuda(torch::Tensor emb, torch::Tensor weight, torch::Tensor bias, double eps);
 torch::Tensor layer_norm_f16_small_cuda(torch::Tensor x, torch::Tensor weight, torch::Tensor bias, double eps);
@@ -93,7 +95,6 @@ torch::Tensor emb_ln0_bf16_to_f16(torch::Tensor emb, torch::Tensor weight, torch
   check_bf16_cuda_contig(bias, "bias");
   TORCH_CHECK(emb.dim() == 2, "emb must have shape [V, C]");
   const int64_t c = emb.size(1);
-  TORCH_CHECK(c == 4096, "emb_ln0_bf16_to_f16 currently requires C=4096");
   TORCH_CHECK(weight.dim() == 1 && weight.size(0) == c, "weight shape mismatch");
   TORCH_CHECK(bias.dim() == 1 && bias.size(0) == c, "bias shape mismatch");
   return emb_ln0_bf16_to_f16_cuda(emb, weight, bias, eps);
@@ -436,7 +437,7 @@ std::vector<torch::Tensor> add_layer_norm_cmix_mix_f16(torch::Tensor x, torch::T
   TORCH_CHECK(x.sizes() == residual.sizes(), "add_layer_norm_cmix_mix_f16 x/residual shape mismatch");
   TORCH_CHECK(x.dim() == 3 && x.size(1) == 1, "add_layer_norm_cmix_mix_f16 requires shape [B,1,C]");
   const int64_t c = x.size(2);
-  TORCH_CHECK(c == 4096, "add_layer_norm_cmix_mix_f16 currently requires C=4096");
   TORCH_CHECK(shift_state.dim() == 2 && shift_state.size(0) == x.size(0) && shift_state.size(1) == c,
               "shift_state shape mismatch");
   TORCH_CHECK(weight.dim() == 1 && weight.size(0) == c, "weight shape mismatch");
@@ -470,13 +471,15 @@ std::vector<torch::Tensor> add_layer_norm_tmix_mix6_f16(
   check_half_cuda_contig(x_a, "x_a");
   check_half_cuda_contig(x_g, "x_g");
   TORCH_CHECK(x.sizes() == residual.sizes(), "add_layer_norm_tmix_mix6_f16 x/residual shape mismatch");
-  TORCH_CHECK(x.dim() == 3 && x.size(1) == 1 && x.size(2) == 4096, "add_layer_norm_tmix_mix6_f16 requires shape [B,1,4096]");
-  TORCH_CHECK(shift_state.dim() == 2 && shift_state.size(0) == x.size(0) && shift_state.size(1) == 4096,
               "shift_state shape mismatch");
-  TORCH_CHECK(weight.dim() == 1 && weight.size(0) == 4096, "weight shape mismatch");
-  TORCH_CHECK(bias.dim() == 1 && bias.size(0) == 4096, "bias shape mismatch");
-  TORCH_CHECK(x_r.numel() == 4096 && x_w.numel() == 4096 && x_k.numel() == 4096 &&
-              x_v.numel() == 4096 && x_a.numel() == 4096 && x_g.numel() == 4096,
               "mix vector shape mismatch");
   return add_layer_norm_tmix_mix6_f16_cuda(
       x, residual, shift_state, weight, bias, x_r, x_w, x_k, x_v, x_a, x_g, eps);
@@ -602,10 +605,10 @@ void advance_i32(torch::Tensor x, int64_t amount) {
 } // namespace
 TORCH_LIBRARY(rwkv7_v3a_ops, m) {
-  m.def("layer_norm_f16(Tensor x, Tensor weight, Tensor bias, float eps=1e-5) -> Tensor");
-  m.def("emb_ln0_bf16_to_f16(Tensor emb, Tensor weight, Tensor bias, float eps=1e-5) -> Tensor");
-  m.def("layer_norm_f16_small(Tensor x, Tensor weight, Tensor bias, float eps=1e-5) -> Tensor");
-  m.def("layer_norm_f16_small512(Tensor x, Tensor weight, Tensor bias, float eps=1e-5) -> Tensor");
   m.def("linear_f16(Tensor x, Tensor weight) -> Tensor");
   m.def("linear_f16_orig(Tensor x, Tensor weight_orig) -> Tensor");
   m.def("linear_orig_rows_f16(Tensor x, Tensor weight_orig, int row_tile, int out_tile) -> Tensor");
@@ -628,14 +631,14 @@ TORCH_LIBRARY(rwkv7_v3a_ops, m) {
   m.def("linear_wag_rank_out_f16(Tensor w1, Tensor a1, Tensor g1, Tensor w2_t, Tensor a2_t, Tensor g2_t) -> Tensor[]");
   m.def("linear_wagv_rank_out_f16(Tensor w1, Tensor a1, Tensor g1, Tensor v1, Tensor w2_t, Tensor a2_t, Tensor g2_t, Tensor v2_t, Tensor v, Tensor v_first, Tensor v0) -> Tensor[]");
   m.def("add_f16(Tensor x, Tensor y) -> Tensor");
-  m.def("add_layer_norm_f16(Tensor x, Tensor residual, Tensor weight, Tensor bias, float eps=1e-5) -> Tensor[]");
-  m.def("add_last_layer_norm_f16(Tensor x, Tensor residual, Tensor weight, Tensor bias, float eps=1e-5) -> Tensor");
-  m.def("add_layer_norm_cmix_mix_f16(Tensor x, Tensor residual, Tensor(a!) shift_state, Tensor weight, Tensor bias, Tensor x_k, float eps=1e-5) -> Tensor[]");
-  m.def("add_layer_norm_tmix_mix6_f16(Tensor x, Tensor residual, Tensor(a!) shift_state, Tensor weight, Tensor bias, Tensor x_r, Tensor x_w, Tensor x_k, Tensor x_v, Tensor x_a, Tensor x_g, float eps=1e-5) -> Tensor[]");
   m.def("add_layer_norm_tmix_mix6_f16_cfg(Tensor x, Tensor residual, Tensor(a!) shift_state, Tensor weight, Tensor bias, Tensor x_r, Tensor x_w, Tensor x_k, Tensor x_v, Tensor x_a, Tensor x_g, float eps, int threads) -> Tensor[]");
-  m.def("add_layer_norm_tmix_mix6_f16_scalar_stats(Tensor x, Tensor residual, Tensor(a!) shift_state, Tensor weight, Tensor bias, Tensor x_r, Tensor x_w, Tensor x_k, Tensor x_v, Tensor x_a, Tensor x_g, float eps=1e-5) -> Tensor[]");
   m.def("add_layer_norm_cmix_mix_f16_cfg(Tensor x, Tensor residual, Tensor(a!) shift_state, Tensor weight, Tensor bias, Tensor x_k, float eps, int threads) -> Tensor[]");
-  m.def("add_layer_norm_cmix_mix_f16_scalar_stats(Tensor x, Tensor residual, Tensor(a!) shift_state, Tensor weight, Tensor bias, Tensor x_k, float eps=1e-5) -> Tensor[]");
   m.def("advance_i32(Tensor(a!) x, int amount) -> ()");
 }

 #include <torch/extension.h>
 #include <vector>
+#define RWKV7_LAYER_NORM_EPS_SCHEMA "1e-5"
 torch::Tensor layer_norm_f16_cuda(torch::Tensor x, torch::Tensor weight, torch::Tensor bias, double eps);
 torch::Tensor emb_ln0_bf16_to_f16_cuda(torch::Tensor emb, torch::Tensor weight, torch::Tensor bias, double eps);
 torch::Tensor layer_norm_f16_small_cuda(torch::Tensor x, torch::Tensor weight, torch::Tensor bias, double eps);
   check_bf16_cuda_contig(bias, "bias");
   TORCH_CHECK(emb.dim() == 2, "emb must have shape [V, C]");
   const int64_t c = emb.size(1);
   TORCH_CHECK(weight.dim() == 1 && weight.size(0) == c, "weight shape mismatch");
   TORCH_CHECK(bias.dim() == 1 && bias.size(0) == c, "bias shape mismatch");
   return emb_ln0_bf16_to_f16_cuda(emb, weight, bias, eps);
   TORCH_CHECK(x.sizes() == residual.sizes(), "add_layer_norm_cmix_mix_f16 x/residual shape mismatch");
   TORCH_CHECK(x.dim() == 3 && x.size(1) == 1, "add_layer_norm_cmix_mix_f16 requires shape [B,1,C]");
   const int64_t c = x.size(2);
+  TORCH_CHECK((c % 2) == 0 && c > 0 && c <= 8192, "unsupported C");
   TORCH_CHECK(shift_state.dim() == 2 && shift_state.size(0) == x.size(0) && shift_state.size(1) == c,
               "shift_state shape mismatch");
   TORCH_CHECK(weight.dim() == 1 && weight.size(0) == c, "weight shape mismatch");
   check_half_cuda_contig(x_a, "x_a");
   check_half_cuda_contig(x_g, "x_g");
   TORCH_CHECK(x.sizes() == residual.sizes(), "add_layer_norm_tmix_mix6_f16 x/residual shape mismatch");
+  TORCH_CHECK(x.dim() == 3 && x.size(1) == 1, "add_layer_norm_tmix_mix6_f16 requires shape [B,1,C]");
+  const int64_t c = x.size(2);
+  TORCH_CHECK((c % 2) == 0 && c > 0 && c <= 8192, "unsupported C");
+  TORCH_CHECK(shift_state.dim() == 2 && shift_state.size(0) == x.size(0) && shift_state.size(1) == c,
               "shift_state shape mismatch");
+  TORCH_CHECK(weight.dim() == 1 && weight.size(0) == c, "weight shape mismatch");
+  TORCH_CHECK(bias.dim() == 1 && bias.size(0) == c, "bias shape mismatch");
+  TORCH_CHECK(x_r.numel() == c && x_w.numel() == c && x_k.numel() == c &&
+              x_v.numel() == c && x_a.numel() == c && x_g.numel() == c,
               "mix vector shape mismatch");
   return add_layer_norm_tmix_mix6_f16_cuda(
       x, residual, shift_state, weight, bias, x_r, x_w, x_k, x_v, x_a, x_g, eps);
 } // namespace
 TORCH_LIBRARY(rwkv7_v3a_ops, m) {
+  m.def("layer_norm_f16(Tensor x, Tensor weight, Tensor bias, float eps=" RWKV7_LAYER_NORM_EPS_SCHEMA ") -> Tensor");
+  m.def("emb_ln0_bf16_to_f16(Tensor emb, Tensor weight, Tensor bias, float eps=" RWKV7_LAYER_NORM_EPS_SCHEMA ") -> Tensor");
+  m.def("layer_norm_f16_small(Tensor x, Tensor weight, Tensor bias, float eps=" RWKV7_LAYER_NORM_EPS_SCHEMA ") -> Tensor");
+  m.def("layer_norm_f16_small512(Tensor x, Tensor weight, Tensor bias, float eps=" RWKV7_LAYER_NORM_EPS_SCHEMA ") -> Tensor");
   m.def("linear_f16(Tensor x, Tensor weight) -> Tensor");
   m.def("linear_f16_orig(Tensor x, Tensor weight_orig) -> Tensor");
   m.def("linear_orig_rows_f16(Tensor x, Tensor weight_orig, int row_tile, int out_tile) -> Tensor");
   m.def("linear_wag_rank_out_f16(Tensor w1, Tensor a1, Tensor g1, Tensor w2_t, Tensor a2_t, Tensor g2_t) -> Tensor[]");
   m.def("linear_wagv_rank_out_f16(Tensor w1, Tensor a1, Tensor g1, Tensor v1, Tensor w2_t, Tensor a2_t, Tensor g2_t, Tensor v2_t, Tensor v, Tensor v_first, Tensor v0) -> Tensor[]");
   m.def("add_f16(Tensor x, Tensor y) -> Tensor");
+  m.def("add_layer_norm_f16(Tensor x, Tensor residual, Tensor weight, Tensor bias, float eps=" RWKV7_LAYER_NORM_EPS_SCHEMA ") -> Tensor[]");
+  m.def("add_last_layer_norm_f16(Tensor x, Tensor residual, Tensor weight, Tensor bias, float eps=" RWKV7_LAYER_NORM_EPS_SCHEMA ") -> Tensor");
+  m.def("add_layer_norm_cmix_mix_f16(Tensor x, Tensor residual, Tensor(a!) shift_state, Tensor weight, Tensor bias, Tensor x_k, float eps=" RWKV7_LAYER_NORM_EPS_SCHEMA ") -> Tensor[]");
+  m.def("add_layer_norm_tmix_mix6_f16(Tensor x, Tensor residual, Tensor(a!) shift_state, Tensor weight, Tensor bias, Tensor x_r, Tensor x_w, Tensor x_k, Tensor x_v, Tensor x_a, Tensor x_g, float eps=" RWKV7_LAYER_NORM_EPS_SCHEMA ") -> Tensor[]");
   m.def("add_layer_norm_tmix_mix6_f16_cfg(Tensor x, Tensor residual, Tensor(a!) shift_state, Tensor weight, Tensor bias, Tensor x_r, Tensor x_w, Tensor x_k, Tensor x_v, Tensor x_a, Tensor x_g, float eps, int threads) -> Tensor[]");
+  m.def("add_layer_norm_tmix_mix6_f16_scalar_stats(Tensor x, Tensor residual, Tensor(a!) shift_state, Tensor weight, Tensor bias, Tensor x_r, Tensor x_w, Tensor x_k, Tensor x_v, Tensor x_a, Tensor x_g, float eps=" RWKV7_LAYER_NORM_EPS_SCHEMA ") -> Tensor[]");
   m.def("add_layer_norm_cmix_mix_f16_cfg(Tensor x, Tensor residual, Tensor(a!) shift_state, Tensor weight, Tensor bias, Tensor x_k, float eps, int threads) -> Tensor[]");
+  m.def("add_layer_norm_cmix_mix_f16_scalar_stats(Tensor x, Tensor residual, Tensor(a!) shift_state, Tensor weight, Tensor bias, Tensor x_k, float eps=" RWKV7_LAYER_NORM_EPS_SCHEMA ") -> Tensor[]");
   m.def("advance_i32(Tensor(a!) x, int amount) -> ()");
 }

cuda/rwkv7_v3a_ops.cu CHANGED Viewed

@@ -1990,6 +1990,182 @@ __global__ __launch_bounds__(Threads, 1) void add_last_layer_norm_f16_small_kern
   }
 }
 } // namespace
 at::Tensor add_f16_cuda(at::Tensor x, at::Tensor y) {
@@ -2154,9 +2330,16 @@ at::Tensor add_last_layer_norm_f16_cuda(at::Tensor x, at::Tensor residual, at::T
   const int64_t B = x.size(0);
   const int64_t T = x.size(1);
   const int64_t C = x.size(2);
-  TORCH_CHECK(C == LN_SMALL_C, "add_last_layer_norm_f16 currently requires C=4096");
   auto y = at::empty({B, C}, x.options());
   auto stream = at::cuda::getCurrentCUDAStream();
   if (B >= 1024) {
     add_last_layer_norm_f16_small_kernel<LN_SMALL512_THREADS, true, true><<<static_cast<int>(B), LN_SMALL512_THREADS, 0, stream>>>(
         x.data_ptr<dtype>(), residual.data_ptr<dtype>(), weight.data_ptr<dtype>(), bias.data_ptr<dtype>(),
@@ -2184,19 +2367,36 @@ std::vector<at::Tensor> add_layer_norm_cmix_mix_f16_cuda(
     double eps) {
   auto x_out = at::empty_like(x);
   auto mixed = at::empty_like(x);
-  const int64_t rows = x.numel() / LN_SMALL_C;
   auto stream = at::cuda::getCurrentCUDAStream();
-  add_layer_norm_cmix_mix_f16_scalar_stats_kernel<LN_SMALL_THREADS><<<static_cast<int>(rows), LN_SMALL_THREADS, 0, stream>>>(
-      x.data_ptr<dtype>(),
-      residual.data_ptr<dtype>(),
-      shift_state.data_ptr<dtype>(),
-      weight.data_ptr<dtype>(),
-      bias.data_ptr<dtype>(),
-      x_k.data_ptr<dtype>(),
-      x_out.data_ptr<dtype>(),
-      mixed.data_ptr<dtype>(),
-      rows,
-      static_cast<float>(eps));
   C10_CUDA_KERNEL_LAUNCH_CHECK();
   return {x_out, mixed};
 }
@@ -2211,19 +2411,36 @@ std::vector<at::Tensor> add_layer_norm_cmix_mix_f16_scalar_stats_cuda(
     double eps) {
   auto x_out = at::empty_like(x);
   auto mixed = at::empty_like(x);
-  const int64_t rows = x.numel() / LN_SMALL_C;
   auto stream = at::cuda::getCurrentCUDAStream();
-  add_layer_norm_cmix_mix_f16_scalar_stats_kernel<LN_SMALL_THREADS><<<static_cast<int>(rows), LN_SMALL_THREADS, 0, stream>>>(
-      x.data_ptr<dtype>(),
-      residual.data_ptr<dtype>(),
-      shift_state.data_ptr<dtype>(),
-      weight.data_ptr<dtype>(),
-      bias.data_ptr<dtype>(),
-      x_k.data_ptr<dtype>(),
-      x_out.data_ptr<dtype>(),
-      mixed.data_ptr<dtype>(),
-      rows,
-      static_cast<float>(eps));
   C10_CUDA_KERNEL_LAUNCH_CHECK();
   return {x_out, mixed};
 }
@@ -2248,29 +2465,56 @@ std::vector<at::Tensor> add_layer_norm_tmix_mix6_f16_cuda(
   auto out_v = at::empty_like(x);
   auto out_a = at::empty_like(x);
   auto out_g = at::empty_like(x);
-  const int64_t rows = x.numel() / LN_SMALL_C;
   auto stream = at::cuda::getCurrentCUDAStream();
-  add_layer_norm_tmix_mix6_f16_scalar_stats_kernel<LN_SMALL_THREADS><<<static_cast<int>(rows), LN_SMALL_THREADS, 0, stream>>>(
-      x.data_ptr<dtype>(),
-      residual.data_ptr<dtype>(),
-      shift_state.data_ptr<dtype>(),
-      weight.data_ptr<dtype>(),
-      bias.data_ptr<dtype>(),
-      x_r.data_ptr<dtype>(),
-      x_w.data_ptr<dtype>(),
-      x_k.data_ptr<dtype>(),
-      x_v.data_ptr<dtype>(),
-      x_a.data_ptr<dtype>(),
-      x_g.data_ptr<dtype>(),
-      x_out.data_ptr<dtype>(),
-      out_r.data_ptr<dtype>(),
-      out_w.data_ptr<dtype>(),
-      out_k.data_ptr<dtype>(),
-      out_v.data_ptr<dtype>(),
-      out_a.data_ptr<dtype>(),
-      out_g.data_ptr<dtype>(),
-      rows,
-      static_cast<float>(eps));
   C10_CUDA_KERNEL_LAUNCH_CHECK();
   return {x_out, out_r, out_w, out_k, out_v, out_a, out_g};
 }

   }
 }
+template <int Threads>  // Generic-C kernel: LayerNorm of (x + residual) taken at the last time step only; one block per batch row.
+__global__ __launch_bounds__(Threads, 1) void add_last_layer_norm_f16_generic_kernel(
+    const dtype* __restrict__ x,
+    const dtype* __restrict__ residual,
+    const dtype* __restrict__ weight,
+    const dtype* __restrict__ bias,
+    dtype* __restrict__ y,  // output, indexed as [B, C]
+    int64_t B,
+    int64_t T,
+    int C,
+    float eps) {
+  const int64_t bidx = blockIdx.x;
+  if (bidx >= B) {  // whole block exits together, before any block-wide reduction
+    return;
+  }
+  const int64_t src = (bidx * T + (T - 1)) * static_cast<int64_t>(C);  // element offset of time step T-1 in row bidx (indexing assumes contiguous [B, T, C])
+  const int64_t dst = bidx * static_cast<int64_t>(C);  // element offset of row bidx in y
+  float sum = 0.0f;
+  for (int c = threadIdx.x; c < C; c += Threads) {  // pass 1: per-thread partial sum of x + residual
+    sum += __half2float(*reinterpret_cast<const __half*>(x + src + c)) +
+           __half2float(*reinterpret_cast<const __half*>(residual + src + c));
+  }
+  sum = block_sum_t<Threads>(sum);  // block_sum_t is defined outside this view; presumably a block-wide sum returned to every thread — confirm
+  const float mean = sum / static_cast<float>(C);
+  float sum_var = 0.0f;
+  for (int c = threadIdx.x; c < C; c += Threads) {  // pass 2: inputs are re-read to accumulate squared deviations from the mean
+    const float v = __half2float(*reinterpret_cast<const __half*>(x + src + c)) +
+                    __half2float(*reinterpret_cast<const __half*>(residual + src + c));
+    const float d = v - mean;
+    sum_var += d * d;
+  }
+  sum_var = block_sum_t<Threads>(sum_var);
+  const float rstd = rsqrtf(sum_var / static_cast<float>(C) + eps);  // 1 / sqrt(variance + eps), variance divided by C
+  const int pairs = C >> 1;  // pass 3 works on __half2 pairs, so an odd trailing channel would never be written; the host wrapper checks C % 2 == 0
+  for (int p = threadIdx.x; p < pairs; p += Threads) {
+    const float2 xv = __half22float2(reinterpret_cast<const __half2*>(x + src)[p]);
+    const float2 rv = __half22float2(reinterpret_cast<const __half2*>(residual + src)[p]);
+    const float sx = xv.x + rv.x;
+    const float sy = xv.y + rv.y;
+    const float2 w = __half22float2(reinterpret_cast<const __half2*>(weight)[p]);
+    const float2 bb = __half22float2(reinterpret_cast<const __half2*>(bias)[p]);
+    reinterpret_cast<__half2*>(y + dst)[p] = __floats2half2_rn(  // y = (x + residual - mean) * rstd * weight + bias, rounded to half
+        (sx - mean) * rstd * w.x + bb.x,
+        (sy - mean) * rstd * w.y + bb.y);
+  }
+}
+template <int Threads>  // Generic-C kernel: residual add + LayerNorm, then a per-channel mix of the normalized row with the previous one in shift_state; one block per row.
+__global__ __launch_bounds__(Threads, 1) void add_layer_norm_cmix_mix_f16_generic_kernel(
+    const dtype* __restrict__ x,
+    const dtype* __restrict__ residual,
+    dtype* __restrict__ shift_state,  // in/out: read as the previous normalized row, then overwritten with the current one
+    const dtype* __restrict__ weight,
+    const dtype* __restrict__ bias,
+    const dtype* __restrict__ x_k,  // per-channel mix factor, indexed by channel only (shared across rows)
+    dtype* __restrict__ x_out,  // out: x + residual, before normalization
+    dtype* __restrict__ mixed,  // out: y + (prev - y) * x_k
+    int64_t rows,
+    int C,
+    float eps) {
+  const int64_t row = blockIdx.x;
+  if (row >= rows) {  // whole block exits together, before any block-wide reduction
+    return;
+  }
+  const int64_t base = row * static_cast<int64_t>(C);  // element offset of this row
+  float sum = 0.0f;
+  for (int c = threadIdx.x; c < C; c += Threads) {  // pass 1: per-thread partial sum of x + residual
+    sum += __half2float(*reinterpret_cast<const __half*>(x + base + c)) +
+           __half2float(*reinterpret_cast<const __half*>(residual + base + c));
+  }
+  sum = block_sum_t<Threads>(sum);  // block_sum_t is defined outside this view; presumably a block-wide sum returned to every thread — confirm
+  const float mean = sum / static_cast<float>(C);
+  float sum_var = 0.0f;
+  for (int c = threadIdx.x; c < C; c += Threads) {  // pass 2: inputs are re-read to accumulate squared deviations from the mean
+    const float v = __half2float(*reinterpret_cast<const __half*>(x + base + c)) +
+                    __half2float(*reinterpret_cast<const __half*>(residual + base + c));
+    const float d = v - mean;
+    sum_var += d * d;
+  }
+  sum_var = block_sum_t<Threads>(sum_var);
+  const float rstd = rsqrtf(sum_var / static_cast<float>(C) + eps);  // 1 / sqrt(variance + eps), variance divided by C
+  const int pairs = C >> 1;  // pass 3 works on __half2 pairs; the host wrapper checks C % 2 == 0
+  const int64_t base2 = base >> 1;  // row offset in __half2 units; exact only when base is even
+  for (int p = threadIdx.x; p < pairs; p += Threads) {
+    const float2 xv = __half22float2(reinterpret_cast<const __half2*>(x)[base2 + p]);
+    const float2 rv = __half22float2(reinterpret_cast<const __half2*>(residual)[base2 + p]);
+    const float2 w = __half22float2(reinterpret_cast<const __half2*>(weight)[p]);
+    const float2 b = __half22float2(reinterpret_cast<const __half2*>(bias)[p]);
+    const float2 prev = __half22float2(reinterpret_cast<const __half2*>(shift_state)[base2 + p]);
+    const float2 mix = __half22float2(reinterpret_cast<const __half2*>(x_k)[p]);
+    const float x0 = xv.x + rv.x;
+    const float x1 = xv.y + rv.y;
+    const __half2 y2 = __floats2half2_rn((x0 - mean) * rstd * w.x + b.x, (x1 - mean) * rstd * w.y + b.y);  // normalized output, rounded to half
+    const float2 yv = __half22float2(y2);  // mix uses the half-rounded value, the same one stored into shift_state
+    reinterpret_cast<__half2*>(x_out)[base2 + p] = __floats2half2_rn(x0, x1);
+    reinterpret_cast<__half2*>(mixed)[base2 + p] =
+        __floats2half2_rn(yv.x + (prev.x - yv.x) * mix.x, yv.y + (prev.y - yv.y) * mix.y);
+    reinterpret_cast<__half2*>(shift_state)[base2 + p] = y2;  // state updated in place after prev has been read
+  }
+}
+template <int Threads>  // Generic-C kernel: residual add + LayerNorm, then six per-channel mixes (r, w, k, v, a, g) of the normalized row with the previous one in shift_state; one block per row.
+__global__ __launch_bounds__(Threads, 1) void add_layer_norm_tmix_mix6_f16_generic_kernel(
+    const dtype* __restrict__ x,
+    const dtype* __restrict__ residual,
+    dtype* __restrict__ shift_state,  // in/out: read as the previous normalized row, then overwritten with the current one
+    const dtype* __restrict__ weight,
+    const dtype* __restrict__ bias,
+    const dtype* __restrict__ x_r,  // x_r .. x_g: per-channel mix factors, indexed by channel only (shared across rows)
+    const dtype* __restrict__ x_w,
+    const dtype* __restrict__ x_k,
+    const dtype* __restrict__ x_v,
+    const dtype* __restrict__ x_a,
+    const dtype* __restrict__ x_g,
+    dtype* __restrict__ x_out,  // out: x + residual, before normalization
+    dtype* __restrict__ out_r,  // out_r .. out_g: y + (prev - y) * the matching mix factor
+    dtype* __restrict__ out_w,
+    dtype* __restrict__ out_k,
+    dtype* __restrict__ out_v,
+    dtype* __restrict__ out_a,
+    dtype* __restrict__ out_g,
+    int64_t rows,
+    int C,
+    float eps) {
+  const int64_t row = blockIdx.x;
+  if (row >= rows) {  // whole block exits together, before any block-wide reduction
+    return;
+  }
+  const int64_t base = row * static_cast<int64_t>(C);  // element offset of this row
+  float sum = 0.0f;
+  for (int c = threadIdx.x; c < C; c += Threads) {  // pass 1: per-thread partial sum of x + residual
+    sum += __half2float(*reinterpret_cast<const __half*>(x + base + c)) +
+           __half2float(*reinterpret_cast<const __half*>(residual + base + c));
+  }
+  sum = block_sum_t<Threads>(sum);  // block_sum_t is defined outside this view; presumably a block-wide sum returned to every thread — confirm
+  const float mean = sum / static_cast<float>(C);
+  float sum_var = 0.0f;
+  for (int c = threadIdx.x; c < C; c += Threads) {  // pass 2: inputs are re-read to accumulate squared deviations from the mean
+    const float v = __half2float(*reinterpret_cast<const __half*>(x + base + c)) +
+                    __half2float(*reinterpret_cast<const __half*>(residual + base + c));
+    const float d = v - mean;
+    sum_var += d * d;
+  }
+  sum_var = block_sum_t<Threads>(sum_var);
+  const float rstd = rsqrtf(sum_var / static_cast<float>(C) + eps);  // 1 / sqrt(variance + eps), variance divided by C
+  const int pairs = C >> 1;  // pass 3 works on __half2 pairs; the host wrapper checks C % 2 == 0
+  const int64_t base2 = base >> 1;  // row offset in __half2 units; exact only when base is even
+  for (int p = threadIdx.x; p < pairs; p += Threads) {
+    const float2 xv = __half22float2(reinterpret_cast<const __half2*>(x)[base2 + p]);
+    const float2 rv = __half22float2(reinterpret_cast<const __half2*>(residual)[base2 + p]);
+    const float2 w = __half22float2(reinterpret_cast<const __half2*>(weight)[p]);
+    const float2 b = __half22float2(reinterpret_cast<const __half2*>(bias)[p]);
+    const float2 prev = __half22float2(reinterpret_cast<const __half2*>(shift_state)[base2 + p]);
+    const float x0 = xv.x + rv.x;
+    const float x1 = xv.y + rv.y;
+    const __half2 y2 = __floats2half2_rn((x0 - mean) * rstd * w.x + b.x, (x1 - mean) * rstd * w.y + b.y);  // normalized output, rounded to half
+    const float2 yv = __half22float2(y2);  // mixes use the half-rounded value, the same one stored into shift_state
+    const float dx0 = prev.x - yv.x;  // previous-minus-current difference, shared by all six mixes
+    const float dx1 = prev.y - yv.y;
+    const float2 mr = __half22float2(reinterpret_cast<const __half2*>(x_r)[p]);
+    const float2 mw = __half22float2(reinterpret_cast<const __half2*>(x_w)[p]);
+    const float2 mk = __half22float2(reinterpret_cast<const __half2*>(x_k)[p]);
+    const float2 mv = __half22float2(reinterpret_cast<const __half2*>(x_v)[p]);
+    const float2 ma = __half22float2(reinterpret_cast<const __half2*>(x_a)[p]);
+    const float2 mg = __half22float2(reinterpret_cast<const __half2*>(x_g)[p]);
+    reinterpret_cast<__half2*>(x_out)[base2 + p] = __floats2half2_rn(x0, x1);
+    reinterpret_cast<__half2*>(out_r)[base2 + p] = __floats2half2_rn(yv.x + dx0 * mr.x, yv.y + dx1 * mr.y);
+    reinterpret_cast<__half2*>(out_w)[base2 + p] = __floats2half2_rn(yv.x + dx0 * mw.x, yv.y + dx1 * mw.y);
+    reinterpret_cast<__half2*>(out_k)[base2 + p] = __floats2half2_rn(yv.x + dx0 * mk.x, yv.y + dx1 * mk.y);
+    reinterpret_cast<__half2*>(out_v)[base2 + p] = __floats2half2_rn(yv.x + dx0 * mv.x, yv.y + dx1 * mv.y);
+    reinterpret_cast<__half2*>(out_a)[base2 + p] = __floats2half2_rn(yv.x + dx0 * ma.x, yv.y + dx1 * ma.y);
+    reinterpret_cast<__half2*>(out_g)[base2 + p] = __floats2half2_rn(yv.x + dx0 * mg.x, yv.y + dx1 * mg.y);
+    reinterpret_cast<__half2*>(shift_state)[base2 + p] = y2;  // state updated in place after prev has been read
+  }
+}
 } // namespace
 at::Tensor add_f16_cuda(at::Tensor x, at::Tensor y) {
   const int64_t B = x.size(0);
   const int64_t T = x.size(1);
   const int64_t C = x.size(2);
+  TORCH_CHECK((C % 2) == 0, "add_last_layer_norm_f16 requires even C");
   auto y = at::empty({B, C}, x.options());
   auto stream = at::cuda::getCurrentCUDAStream();
+  if (C != LN_SMALL_C) {
+    add_last_layer_norm_f16_generic_kernel<LN_THREADS><<<static_cast<int>(B), LN_THREADS, 0, stream>>>(
+        x.data_ptr<dtype>(), residual.data_ptr<dtype>(), weight.data_ptr<dtype>(), bias.data_ptr<dtype>(),
+        y.data_ptr<dtype>(), B, T, static_cast<int>(C), static_cast<float>(eps));
+    C10_CUDA_KERNEL_LAUNCH_CHECK();
+    return y;
+  }
   if (B >= 1024) {
     add_last_layer_norm_f16_small_kernel<LN_SMALL512_THREADS, true, true><<<static_cast<int>(B), LN_SMALL512_THREADS, 0, stream>>>(
         x.data_ptr<dtype>(), residual.data_ptr<dtype>(), weight.data_ptr<dtype>(), bias.data_ptr<dtype>(),
     double eps) {
   auto x_out = at::empty_like(x);
   auto mixed = at::empty_like(x);
+  const int64_t C = x.size(-1);
+  TORCH_CHECK((C % 2) == 0, "add_layer_norm_cmix_mix_f16 requires even C");
+  const int64_t rows = x.numel() / C;
   auto stream = at::cuda::getCurrentCUDAStream();
+  if (C == LN_SMALL_C) {
+    add_layer_norm_cmix_mix_f16_scalar_stats_kernel<LN_SMALL_THREADS><<<static_cast<int>(rows), LN_SMALL_THREADS, 0, stream>>>(
+        x.data_ptr<dtype>(),
+        residual.data_ptr<dtype>(),
+        shift_state.data_ptr<dtype>(),
+        weight.data_ptr<dtype>(),
+        bias.data_ptr<dtype>(),
+        x_k.data_ptr<dtype>(),
+        x_out.data_ptr<dtype>(),
+        mixed.data_ptr<dtype>(),
+        rows,
+        static_cast<float>(eps));
+  } else {
+    add_layer_norm_cmix_mix_f16_generic_kernel<LN_THREADS><<<static_cast<int>(rows), LN_THREADS, 0, stream>>>(
+        x.data_ptr<dtype>(),
+        residual.data_ptr<dtype>(),
+        shift_state.data_ptr<dtype>(),
+        weight.data_ptr<dtype>(),
+        bias.data_ptr<dtype>(),
+        x_k.data_ptr<dtype>(),
+        x_out.data_ptr<dtype>(),
+        mixed.data_ptr<dtype>(),
+        rows,
+        static_cast<int>(C),
+        static_cast<float>(eps));
+  }
   C10_CUDA_KERNEL_LAUNCH_CHECK();
   return {x_out, mixed};
 }
     double eps) {
   auto x_out = at::empty_like(x);
   auto mixed = at::empty_like(x);
+  const int64_t C = x.size(-1);
+  TORCH_CHECK((C % 2) == 0, "add_layer_norm_cmix_mix_f16 requires even C");
+  const int64_t rows = x.numel() / C;
   auto stream = at::cuda::getCurrentCUDAStream();
+  if (C == LN_SMALL_C) {
+    add_layer_norm_cmix_mix_f16_scalar_stats_kernel<LN_SMALL_THREADS><<<static_cast<int>(rows), LN_SMALL_THREADS, 0, stream>>>(
+        x.data_ptr<dtype>(),
+        residual.data_ptr<dtype>(),
+        shift_state.data_ptr<dtype>(),
+        weight.data_ptr<dtype>(),
+        bias.data_ptr<dtype>(),
+        x_k.data_ptr<dtype>(),
+        x_out.data_ptr<dtype>(),
+        mixed.data_ptr<dtype>(),
+        rows,
+        static_cast<float>(eps));
+  } else {
+    add_layer_norm_cmix_mix_f16_generic_kernel<LN_THREADS><<<static_cast<int>(rows), LN_THREADS, 0, stream>>>(
+        x.data_ptr<dtype>(),
+        residual.data_ptr<dtype>(),
+        shift_state.data_ptr<dtype>(),
+        weight.data_ptr<dtype>(),
+        bias.data_ptr<dtype>(),
+        x_k.data_ptr<dtype>(),
+        x_out.data_ptr<dtype>(),
+        mixed.data_ptr<dtype>(),
+        rows,
+        static_cast<int>(C),
+        static_cast<float>(eps));
+  }
   C10_CUDA_KERNEL_LAUNCH_CHECK();
   return {x_out, mixed};
 }
   auto out_v = at::empty_like(x);
   auto out_a = at::empty_like(x);
   auto out_g = at::empty_like(x);
+  const int64_t C = x.size(-1);
+  TORCH_CHECK((C % 2) == 0, "add_layer_norm_tmix_mix6_f16 requires even C");
+  const int64_t rows = x.numel() / C;
   auto stream = at::cuda::getCurrentCUDAStream();
+  if (C == LN_SMALL_C) {
+    add_layer_norm_tmix_mix6_f16_scalar_stats_kernel<LN_SMALL_THREADS><<<static_cast<int>(rows), LN_SMALL_THREADS, 0, stream>>>(
+        x.data_ptr<dtype>(),
+        residual.data_ptr<dtype>(),
+        shift_state.data_ptr<dtype>(),
+        weight.data_ptr<dtype>(),
+        bias.data_ptr<dtype>(),
+        x_r.data_ptr<dtype>(),
+        x_w.data_ptr<dtype>(),
+        x_k.data_ptr<dtype>(),
+        x_v.data_ptr<dtype>(),
+        x_a.data_ptr<dtype>(),
+        x_g.data_ptr<dtype>(),
+        x_out.data_ptr<dtype>(),
+        out_r.data_ptr<dtype>(),
+        out_w.data_ptr<dtype>(),
+        out_k.data_ptr<dtype>(),
+        out_v.data_ptr<dtype>(),
+        out_a.data_ptr<dtype>(),
+        out_g.data_ptr<dtype>(),
+        rows,
+        static_cast<float>(eps));
+  } else {
+    add_layer_norm_tmix_mix6_f16_generic_kernel<LN_THREADS><<<static_cast<int>(rows), LN_THREADS, 0, stream>>>(
+        x.data_ptr<dtype>(),
+        residual.data_ptr<dtype>(),
+        shift_state.data_ptr<dtype>(),
+        weight.data_ptr<dtype>(),
+        bias.data_ptr<dtype>(),
+        x_r.data_ptr<dtype>(),
+        x_w.data_ptr<dtype>(),
+        x_k.data_ptr<dtype>(),
+        x_v.data_ptr<dtype>(),
+        x_a.data_ptr<dtype>(),
+        x_g.data_ptr<dtype>(),
+        x_out.data_ptr<dtype>(),
+        out_r.data_ptr<dtype>(),
+        out_w.data_ptr<dtype>(),
+        out_k.data_ptr<dtype>(),
+        out_v.data_ptr<dtype>(),
+        out_a.data_ptr<dtype>(),
+        out_g.data_ptr<dtype>(),
+        rows,
+        static_cast<int>(C),
+        static_cast<float>(eps));
+  }
   C10_CUDA_KERNEL_LAUNCH_CHECK();
   return {x_out, out_r, out_w, out_k, out_v, out_a, out_g};
 }

rwkv7_fast_v3a.py CHANGED Viewed

@@ -25,7 +25,7 @@ ORIG_LINEAR_GROUPS = {"att_c2c", "ffn_key", "head"}
 LOWRANK_SUFFIXES = ("att.w1", "att.w2", "att.a1", "att.a2", "att.g1", "att.g2", "att.v1", "att.v2")
 LOWRANK_IN_ROWS_T = 7
 LOWRANK_OUT_ROWS_T = 4
-CMIX_NOFC_MAX_ROWS = 19
 CMIX_NOFC_ROW20_MAX_T = 5
 CMIX_NOFC_T512_MIN_ROWS = 8
 LN1_TMIX_FUSE = True
@@ -36,8 +36,9 @@ CMIX_ROWS2_NOFC = "rows2_nofc"
 CMIX_DENSE = "dense"
 def main() -> None:
-    global WKV_MODE, EMB_DEVICE, RKV_MODE, CMIX_SPARSE, LOWRANK_WEIGHT, ORIG_LINEAR_GROUPS
     parser = argparse.ArgumentParser()
     parser.add_argument("--warmup", type=int, default=1)
     parser.add_argument("--iters", type=int, default=3)
     parser.add_argument("--cases", default="1x1,1x2,1x4,1x8,1x16,1x32,1x64,1x128,1x256,2x1,4x1,8x1,16x1,32x1,64x1,128x1,256x1,2x2,4x4,8x8,16x16") # try 1x1024 1024x1 32x32 for extreme tps
@@ -54,6 +55,7 @@ def main() -> None:
     parser.add_argument("--orig-linear-groups", default="att_c2c,ffn_key,head") # comma list: none, att_c2c, ffn_key, head
     args = parser.parse_args()
     WKV_MODE = args.wkv
     EMB_DEVICE = args.emb
     RKV_MODE = args.batched_rkv
@@ -62,7 +64,7 @@ def main() -> None:
     ORIG_LINEAR_GROUPS = parse_orig_linear_groups(args.orig_linear_groups)
     groups = ",".join(sorted(ORIG_LINEAR_GROUPS)) if ORIG_LINEAR_GROUPS else "none"
     log(f"start model={MODEL_PATH} wkv={WKV_MODE} emb={EMB_DEVICE} batched_rkv={RKV_MODE} cmix_sparse={CMIX_SPARSE} lowrank_weight={LOWRANK_WEIGHT} orig_linear_groups={groups}")
-    log(f"fixed fast path: ln=v3a linear=v3a/splitk lowrank={LOWRANK_IN_ROWS_T}/{LOWRANK_OUT_ROWS_T} nofc_rows<={CMIX_NOFC_MAX_ROWS} row20_t<={CMIX_NOFC_ROW20_MAX_T} nofc_t512_rows>={CMIX_NOFC_T512_MIN_ROWS}")
     load_extensions(WKV_MODE)
     model = RWKV7()
     if args.eval_json:
@@ -97,7 +99,7 @@ def select_path(B: int, T: int) -> PathConfig:
     if CMIX_SPARSE == "off":
         cmix_mode = CMIX_DENSE
     elif CMIX_SPARSE == "no-fc":
-        use_nofc = rows <= CMIX_NOFC_MAX_ROWS or (rows == 20 and T <= CMIX_NOFC_ROW20_MAX_T)
         cmix_mode = CMIX_B1T1_NOFC if rows == 1 else (CMIX_ROWS2_NOFC if use_nofc else CMIX_DENSE)
     elif rows == 1:
         cmix_mode = CMIX_B1T1_SPARSE
@@ -115,6 +117,12 @@ def select_path(B: int, T: int) -> PathConfig:
         use_batched_rkv = False
     return PathConfig(rows=rows, use_batched_rkv=use_batched_rkv, cmix_mode=cmix_mode)
 def parse_orig_linear_groups(text: str) -> set[str]:
     groups = {x.strip() for x in text.replace(",", " ").split() if x.strip()}
     if not groups or groups == {"none"}:
@@ -130,6 +138,12 @@ def use_orig_linear(group: str) -> bool:
 def is_lowrank_weight(key: str) -> bool:
     return key.endswith(LOWRANK_SUFFIXES)
 def is_att_c2c_weight(key: str) -> bool:
     return ".att." in key and key.endswith(("receptance.weight", "key.weight", "value.weight", "output.weight"))
@@ -173,6 +187,7 @@ class RWKV7:
         C, V = H * N, z["emb.weight"].shape[0]
         assert N == HEAD_SIZE
         log(f"detected model C={C} H={H} N={N} V={V}")
         emb_src = z["emb.weight"].squeeze()
         ln0_w_src = z["blocks.0.ln0.weight"].squeeze()
@@ -271,7 +286,7 @@ class RWKV7:
         dev.copy_(host.view(B,T,C), non_blocking=True)
         return dev
-    def forward_from_x(self, x: torch.Tensor, state: list[torch.Tensor], path: PathConfig, all_logits: bool = False) -> torch.Tensor:
         z = self.z
         B, T, _ = x.shape
         v_first = x
@@ -302,7 +317,11 @@ class RWKV7:
                 else:
                     x, xx = self.add_ln(x, xx, z[p_next+"ln1.weight"], z[p_next+"ln1.bias"])
             elif not all_logits:
-                x = self.add_last_ln(x, xx, z["ln_out.weight"], z["ln_out.bias"])
                 torch.ops.rwkv7_v3a_ops.advance_i32(state[2], T) # !!! IMPORTANT FOR WKV16 DITHERING !!!
                 return self.linear_head(x)
             else:
@@ -323,6 +342,14 @@ class RWKV7:
         x = self.embed(tokens)
         return self.forward_from_x(x, state, path, all_logits=True)
     def tmix(self, layer: int, x: torch.Tensor, shift_state: torch.Tensor, wkv_state: torch.Tensor, elapsed_t: torch.Tensor, v_first: torch.Tensor, p: str, path: PathConfig, pre_mix=None) -> tuple[torch.Tensor, torch.Tensor]:
         z = self.z
         ops = torch.ops.rwkv7_fast_ops_fp16
@@ -351,11 +378,11 @@ class RWKV7:
                 v = self.linear_orig_layout(xv, z[p+"value.weight"], path, "att_c2c")
         v1 = None
-        if LOWRANK_WEIGHT != "orig" and path.rows <= LOWRANK_IN_ROWS_T and path.rows <= LOWRANK_OUT_ROWS_T and layer != 0:
             w1, a1, g1, v1 = torch.ops.rwkv7_v3a_ops.linear_wagv_rank_in_f16(
                 xw.contiguous(), xa.contiguous(), xg.contiguous(), xv.contiguous(),
                 z[p+"w1.t"], z[p+"a1.t"], z[p+"g1.t"], z[p+"v1.t"])
-        elif LOWRANK_WEIGHT != "orig" and path.rows <= LOWRANK_IN_ROWS_T:
             w1, a1, g1 = torch.ops.rwkv7_v3a_ops.linear_wag_rank_in_f16(
                 xw.contiguous(), xa.contiguous(), xg.contiguous(), z[p+"w1.t"], z[p+"a1.t"], z[p+"g1.t"])
         else:
@@ -363,13 +390,13 @@ class RWKV7:
             a1 = self.linear_rank_in(xa, z.get(p+"a1"), z.get(p+"a1.t"), path.rows)
             g1 = self.linear_rank_in(xg, z.get(p+"g1"), z.get(p+"g1.t"), path.rows)
         v_done = False
-        if LOWRANK_WEIGHT != "orig" and path.rows <= LOWRANK_OUT_ROWS_T and layer != 0 and v1 is not None:
             w, a, g, v = torch.ops.rwkv7_v3a_ops.linear_wagv_rank_out_f16(
                 w1.contiguous(), a1.contiguous(), g1.contiguous(), v1.contiguous(),
                 z[p+"w2.t"], z[p+"a2.t"], z[p+"g2.t"], z[p+"v2.t"],
                 v.contiguous(), v_first.contiguous(), z[p+"v0"])
             v_done = True
-        elif LOWRANK_WEIGHT != "orig" and path.rows <= LOWRANK_OUT_ROWS_T:
             w, a, g = torch.ops.rwkv7_v3a_ops.linear_wag_rank_out_f16(
                 w1.contiguous(), a1.contiguous(), g1.contiguous(), z[p+"w2.t"], z[p+"a2.t"], z[p+"g2.t"])
         else:
@@ -381,7 +408,7 @@ class RWKV7:
         if layer == 0:
             v_first = v
         elif not v_done:
-            if LOWRANK_WEIGHT != "orig" and path.rows <= LOWRANK_OUT_ROWS_T:
                 if v1 is None:
                     v1 = self.linear_rank_in(xv, z.get(p+"v1"), z.get(p+"v1.t"), path.rows)
                 v = torch.ops.rwkv7_v3a_ops.linear_t_vres_f16(v1.contiguous(), z[p+"v2.t"], v.contiguous(), v_first.contiguous(), z[p+"v0"])
@@ -447,28 +474,102 @@ class RWKV7:
             return self.linear(x, weight)
         if path.rows == 1:
             if group == "ffn_key":
-                return torch.ops.rwkv7_v3a_ops.linear_orig_rows_exact_f16(x.contiguous(), weight, 128, 2, False)
-            return torch.ops.rwkv7_v3a_ops.linear_orig_rows_exact_f16(x.contiguous(), weight, 128, 2, True)
         if path.rows == 2:
             if group == "att_c2c":
                 return torch.ops.rwkv7_v3a_ops.linear_orig_rows_exact_f16(x.contiguous(), weight, 64, 2, True)
             if group == "ffn_key":
-                return torch.ops.rwkv7_v3a_ops.linear_orig_rows_exact_f16(x.contiguous(), weight, 256, 1, True)
             return torch.ops.rwkv7_v3a_ops.linear_orig_rows_exact_f16(x.contiguous(), weight, 64, 2, True)
         if path.rows == 3:
             if group == "head":
                 return torch.ops.rwkv7_v3a_ops.linear_orig_rows_f16(x.contiguous(), weight, 3, 2)
             if group == "ffn_key":
                 return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 0, 0)
             if group == "att_c2c":
                 return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 0, 2)
             return torch.ops.rwkv7_v3a_ops.linear_orig_rows_cfg_f16(x.contiguous(), weight, 64, 3, 4)
         if path.rows == 4:
             if group == "ffn_key":
                 return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 0, 0)
             if group == "att_c2c":
                 return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 0, 2)
         if group == "head":
             if path.rows >= 1024:
                 return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 128, 0)
             if path.rows >= 512:
@@ -492,6 +593,40 @@ class RWKV7:
             if path.rows >= 72:
                 return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 128, 2)
         if group == "att_c2c":
             if path.rows >= 1024:
                 return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 32, 4)
             if path.rows >= 768:
@@ -523,6 +658,39 @@ class RWKV7:
             if path.rows >= 5:
                 return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 0, 2)
         if group == "ffn_key":
             if path.rows >= 1024:
                 return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 0, 0)
             if path.rows >= 768:
@@ -559,12 +727,12 @@ class RWKV7:
         return self.linear_lowrank_orig(x, weight) if weight is not None else self.linear_t_orig(x, weight_t)
     def linear_rank_out(self, x: torch.Tensor, weight: torch.Tensor, weight_t: torch.Tensor, rows: int) -> torch.Tensor:
-        if weight_t is not None and rows <= LOWRANK_OUT_ROWS_T:
             return torch.ops.rwkv7_v3a_ops.linear_t_f16(x.contiguous(), weight_t)
         return self.linear_lowrank_orig(x, weight) if weight is not None else self.linear_t_orig(x, weight_t)
     def linear_rank_out_act(self, x: torch.Tensor, weight: torch.Tensor, weight_t: torch.Tensor, rows: int, act: int) -> torch.Tensor:
-        if weight_t is not None and rows <= LOWRANK_OUT_ROWS_T:
             return torch.ops.rwkv7_v3a_ops.linear_t_act_f16(x.contiguous(), weight_t, act)
         ops = torch.ops.rwkv7_fast_ops_fp16
         x = ops.act_tanh(x.contiguous()) if act == 1 else ops.act_sigmoid(x.contiguous())

 LOWRANK_SUFFIXES = ("att.w1", "att.w2", "att.a1", "att.a2", "att.g1", "att.g2", "att.v1", "att.v2")
 LOWRANK_IN_ROWS_T = 7
 LOWRANK_OUT_ROWS_T = 4
+LOWRANK_FUSED_MIN_C = 1024
 CMIX_NOFC_ROW20_MAX_T = 5
 CMIX_NOFC_T512_MIN_ROWS = 8
 LN1_TMIX_FUSE = True
 CMIX_DENSE = "dense"
 def main() -> None:
+    global MODEL_PATH, WKV_MODE, EMB_DEVICE, RKV_MODE, CMIX_SPARSE, LOWRANK_WEIGHT, ORIG_LINEAR_GROUPS
     parser = argparse.ArgumentParser()
+    parser.add_argument("--model", default=MODEL_PATH)
     parser.add_argument("--warmup", type=int, default=1)
     parser.add_argument("--iters", type=int, default=3)
     parser.add_argument("--cases", default="1x1,1x2,1x4,1x8,1x16,1x32,1x64,1x128,1x256,2x1,4x1,8x1,16x1,32x1,64x1,128x1,256x1,2x2,4x4,8x8,16x16") # try 1x1024 1024x1 32x32 for extreme tps
     parser.add_argument("--orig-linear-groups", default="att_c2c,ffn_key,head") # comma list: none, att_c2c, ffn_key, head
     args = parser.parse_args()
+    MODEL_PATH = args.model
     WKV_MODE = args.wkv
     EMB_DEVICE = args.emb
     RKV_MODE = args.batched_rkv
     ORIG_LINEAR_GROUPS = parse_orig_linear_groups(args.orig_linear_groups)
     groups = ",".join(sorted(ORIG_LINEAR_GROUPS)) if ORIG_LINEAR_GROUPS else "none"
     log(f"start model={MODEL_PATH} wkv={WKV_MODE} emb={EMB_DEVICE} batched_rkv={RKV_MODE} cmix_sparse={CMIX_SPARSE} lowrank_weight={LOWRANK_WEIGHT} orig_linear_groups={groups}")
+    log(f"fixed fast path: ln=v3a linear=v3a/splitk lowrank={LOWRANK_IN_ROWS_T}/{LOWRANK_OUT_ROWS_T} nofc_rows=by_C row20_t=by_C nofc_t512_rows>={CMIX_NOFC_T512_MIN_ROWS}")
     load_extensions(WKV_MODE)
     model = RWKV7()
     if args.eval_json:
     if CMIX_SPARSE == "off":
         cmix_mode = CMIX_DENSE
     elif CMIX_SPARSE == "no-fc":
+        use_nofc = rows <= cmix_nofc_max_rows() or (rows == 20 and T <= cmix_nofc_row20_max_t())
         cmix_mode = CMIX_B1T1_NOFC if rows == 1 else (CMIX_ROWS2_NOFC if use_nofc else CMIX_DENSE)
     elif rows == 1:
         cmix_mode = CMIX_B1T1_SPARSE
         use_batched_rkv = False
     return PathConfig(rows=rows, use_batched_rkv=use_batched_rkv, cmix_mode=cmix_mode)
+def cmix_nofc_max_rows() -> int:
+    return 19
+def cmix_nofc_row20_max_t() -> int:
+    return CMIX_NOFC_ROW20_MAX_T
 def parse_orig_linear_groups(text: str) -> set[str]:
     groups = {x.strip() for x in text.replace(",", " ").split() if x.strip()}
     if not groups or groups == {"none"}:
 def is_lowrank_weight(key: str) -> bool:
     return key.endswith(LOWRANK_SUFFIXES)
+def can_use_lowrank_fused(rows: int) -> bool:
+    return C >= LOWRANK_FUSED_MIN_C and rows <= LOWRANK_IN_ROWS_T
+def can_use_lowrank_out_fused(rows: int) -> bool:
+    return C >= LOWRANK_FUSED_MIN_C and rows <= LOWRANK_OUT_ROWS_T
 def is_att_c2c_weight(key: str) -> bool:
     return ".att." in key and key.endswith(("receptance.weight", "key.weight", "value.weight", "output.weight"))
         C, V = H * N, z["emb.weight"].shape[0]
         assert N == HEAD_SIZE
         log(f"detected model C={C} H={H} N={N} V={V}")
+        log(f"cmix no-fc path: rows<={cmix_nofc_max_rows()} row20_t<={cmix_nofc_row20_max_t()}")
         emb_src = z["emb.weight"].squeeze()
         ln0_w_src = z["blocks.0.ln0.weight"].squeeze()
         dev.copy_(host.view(B,T,C), non_blocking=True)
         return dev
+    def forward_from_x(self, x: torch.Tensor, state: list[torch.Tensor], path: PathConfig, all_logits: bool = False, last_indices=None) -> torch.Tensor:
         z = self.z
         B, T, _ = x.shape
         v_first = x
                 else:
                     x, xx = self.add_ln(x, xx, z[p_next+"ln1.weight"], z[p_next+"ln1.bias"])
             elif not all_logits:
+                if last_indices is not None:
+                    x = self.ln(self.add(x, xx), z["ln_out.weight"], z["ln_out.bias"])
+                    x = x[torch.arange(B, device=x.device), last_indices].contiguous()
+                else:
+                    x = self.add_last_ln(x, xx, z["ln_out.weight"], z["ln_out.bias"])
                 torch.ops.rwkv7_v3a_ops.advance_i32(state[2], T) # !!! IMPORTANT FOR WKV16 DITHERING !!!
                 return self.linear_head(x)
             else:
         x = self.embed(tokens)
         return self.forward_from_x(x, state, path, all_logits=True)
+    def forward_last_at(self, tokens: torch.Tensor, state: list[torch.Tensor], last_indices: torch.Tensor) -> torch.Tensor:
+        if tokens.dim() == 1:
+            tokens = tokens.unsqueeze(0)
+        B, T = tokens.shape
+        path = select_path(B, T)
+        x = self.embed(tokens)
+        return self.forward_from_x(x, state, path, last_indices=last_indices)
     def tmix(self, layer: int, x: torch.Tensor, shift_state: torch.Tensor, wkv_state: torch.Tensor, elapsed_t: torch.Tensor, v_first: torch.Tensor, p: str, path: PathConfig, pre_mix=None) -> tuple[torch.Tensor, torch.Tensor]:
         z = self.z
         ops = torch.ops.rwkv7_fast_ops_fp16
                 v = self.linear_orig_layout(xv, z[p+"value.weight"], path, "att_c2c")
         v1 = None
+        if LOWRANK_WEIGHT != "orig" and can_use_lowrank_fused(path.rows) and can_use_lowrank_out_fused(path.rows) and layer != 0:
             w1, a1, g1, v1 = torch.ops.rwkv7_v3a_ops.linear_wagv_rank_in_f16(
                 xw.contiguous(), xa.contiguous(), xg.contiguous(), xv.contiguous(),
                 z[p+"w1.t"], z[p+"a1.t"], z[p+"g1.t"], z[p+"v1.t"])
+        elif LOWRANK_WEIGHT != "orig" and can_use_lowrank_fused(path.rows):
             w1, a1, g1 = torch.ops.rwkv7_v3a_ops.linear_wag_rank_in_f16(
                 xw.contiguous(), xa.contiguous(), xg.contiguous(), z[p+"w1.t"], z[p+"a1.t"], z[p+"g1.t"])
         else:
             a1 = self.linear_rank_in(xa, z.get(p+"a1"), z.get(p+"a1.t"), path.rows)
             g1 = self.linear_rank_in(xg, z.get(p+"g1"), z.get(p+"g1.t"), path.rows)
         v_done = False
+        if LOWRANK_WEIGHT != "orig" and can_use_lowrank_out_fused(path.rows) and layer != 0 and v1 is not None:
             w, a, g, v = torch.ops.rwkv7_v3a_ops.linear_wagv_rank_out_f16(
                 w1.contiguous(), a1.contiguous(), g1.contiguous(), v1.contiguous(),
                 z[p+"w2.t"], z[p+"a2.t"], z[p+"g2.t"], z[p+"v2.t"],
                 v.contiguous(), v_first.contiguous(), z[p+"v0"])
             v_done = True
+        elif LOWRANK_WEIGHT != "orig" and can_use_lowrank_out_fused(path.rows):
             w, a, g = torch.ops.rwkv7_v3a_ops.linear_wag_rank_out_f16(
                 w1.contiguous(), a1.contiguous(), g1.contiguous(), z[p+"w2.t"], z[p+"a2.t"], z[p+"g2.t"])
         else:
         if layer == 0:
             v_first = v
         elif not v_done:
+            if LOWRANK_WEIGHT != "orig" and can_use_lowrank_out_fused(path.rows):
                 if v1 is None:
                     v1 = self.linear_rank_in(xv, z.get(p+"v1"), z.get(p+"v1.t"), path.rows)
                 v = torch.ops.rwkv7_v3a_ops.linear_t_vres_f16(v1.contiguous(), z[p+"v2.t"], v.contiguous(), v_first.contiguous(), z[p+"v0"])
             return self.linear(x, weight)
         if path.rows == 1:
             if group == "ffn_key":
+                if C == 2560:
+                    return torch.ops.rwkv7_v3a_ops.linear_orig_rows_exact_f16(x.contiguous(), weight, 128, 2, True)
+                return torch.ops.rwkv7_v3a_ops.linear_orig_rows_exact_f16(x.contiguous(), weight, 128, 2, C <= 1024)
+            return torch.ops.rwkv7_v3a_ops.linear_orig_rows_exact_f16(x.contiguous(), weight, 128, 2, group != "att_c2c" or C < 2048)
         if path.rows == 2:
             if group == "att_c2c":
                 return torch.ops.rwkv7_v3a_ops.linear_orig_rows_exact_f16(x.contiguous(), weight, 64, 2, True)
             if group == "ffn_key":
+                if C == 2560:
+                    return torch.ops.rwkv7_v3a_ops.linear_orig_rows_exact_f16(x.contiguous(), weight, 128, 2, False)
+                if C < 4096:
+                    return torch.ops.rwkv7_v3a_ops.linear_orig_rows_exact_f16(x.contiguous(), weight, 64, 2, True)
+                return torch.ops.rwkv7_v3a_ops.linear_orig_rows_exact_f16(x.contiguous(), weight, 128, 2, False)
+            if group == "head" and C == 2560:
+                return torch.ops.rwkv7_v3a_ops.linear_orig_rows_exact_f16(x.contiguous(), weight, 128, 2, False)
             return torch.ops.rwkv7_v3a_ops.linear_orig_rows_exact_f16(x.contiguous(), weight, 64, 2, True)
         if path.rows == 3:
             if group == "head":
+                if C <= 2048:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig(x.contiguous(), weight)
+                if C == 2560:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig(x.contiguous(), weight)
                 return torch.ops.rwkv7_v3a_ops.linear_orig_rows_f16(x.contiguous(), weight, 3, 2)
             if group == "ffn_key":
+                if C <= 1024:
+                    return torch.ops.rwkv7_v3a_ops.linear_orig_rows_cfg_f16(x.contiguous(), weight, 64, 3, 4)
+                if C == 2048:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig(x.contiguous(), weight)
+                if C == 2560:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig(x.contiguous(), weight)
                 return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 0, 0)
             if group == "att_c2c":
+                if C == 768:
+                    return torch.ops.rwkv7_v3a_ops.linear_orig_rows_f16(x.contiguous(), weight, 1, 2)
+                if C == 1024:
+                    return torch.ops.rwkv7_v3a_ops.linear_orig_rows_f16(x.contiguous(), weight, 2, 2)
+                if C == 2048:
+                    return torch.ops.rwkv7_v3a_ops.linear_orig_rows_f16(x.contiguous(), weight, 3, 4)
+                if C == 2560:
+                    return torch.ops.rwkv7_v3a_ops.linear_orig_rows_f16(x.contiguous(), weight, 3, 2)
                 return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 0, 2)
             return torch.ops.rwkv7_v3a_ops.linear_orig_rows_cfg_f16(x.contiguous(), weight, 64, 3, 4)
         if path.rows == 4:
             if group == "ffn_key":
+                if C <= 1024:
+                    return torch.ops.rwkv7_v3a_ops.linear_orig_rows_cfg_f16(x.contiguous(), weight, 64, 2, 4)
+                if C == 2048:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig(x.contiguous(), weight)
+                if C == 2560:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig(x.contiguous(), weight)
                 return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 0, 0)
             if group == "att_c2c":
+                if C <= 1024:
+                    return torch.ops.rwkv7_v3a_ops.linear_orig_rows_f16(x.contiguous(), weight, 2, 2)
+                if C == 2048:
+                    return torch.ops.rwkv7_v3a_ops.linear_orig_rows_f16(x.contiguous(), weight, 4, 2)
+                if C == 2560:
+                    return torch.ops.rwkv7_v3a_ops.linear_orig_rows_f16(x.contiguous(), weight, 4, 2)
                 return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 0, 2)
         if group == "head":
+            if C == 768:
+                if 192 <= path.rows < 256:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 128, 3)
+                if 96 <= path.rows < 160:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 0, 1)
+            if C == 1024:
+                if 256 <= path.rows < 384:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig(x.contiguous(), weight)
+                if 192 <= path.rows < 256:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 0, 2)
+                if 96 <= path.rows < 160:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 32, 1)
+            if C == 2048:
+                if 256 <= path.rows < 384:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 32, 0)
+                if 192 <= path.rows < 256:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 32, 6)
+                if 128 <= path.rows < 160:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 0, 1)
+                if 96 <= path.rows < 112:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 0, 0)
+            if C == 2560:
+                if path.rows >= 256:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 32, 0)
+                if path.rows >= 192:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 0, 5)
+                if path.rows >= 160:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 32, 5)
+                if path.rows >= 128:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 0, 1)
+                if path.rows >= 96:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 32, 0)
+                if path.rows >= 80:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 0, 0)
+                if path.rows >= 72:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 32, 1)
             if path.rows >= 1024:
                 return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 128, 0)
             if path.rows >= 512:
             if path.rows >= 72:
                 return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 128, 2)
         if group == "att_c2c":
+            if C == 2560 and 17 <= path.rows <= 20:
+                return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 0, 0)
+            if C == 768:
+                if 256 <= path.rows < 384:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 128, 1)
+                if 96 <= path.rows < 112:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 32, 3)
+            if C == 1024:
+                if 256 <= path.rows < 384:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 128, 0)
+                if 96 <= path.rows < 112:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 32, 6)
+            if C == 2048:
+                if 256 <= path.rows < 384:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 32, 3)
+                if 192 <= path.rows < 256:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 128, 0)
+                if 96 <= path.rows < 112:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 32, 4)
+            if C == 2560:
+                if path.rows >= 256:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 0, 1)
+                if path.rows >= 160:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 0, 2)
+                if path.rows >= 128:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 128, 2)
+                if path.rows >= 112:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 128, 3)
+                if path.rows >= 96:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 32, 2)
+                if path.rows >= 72:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 128, 2)
+                if path.rows >= 5:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig(x.contiguous(), weight)
             if path.rows >= 1024:
                 return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 32, 4)
             if path.rows >= 768:
             if path.rows >= 5:
                 return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 0, 2)
         if group == "ffn_key":
+            if C == 2560 and 17 <= path.rows <= 20:
+                return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 0, 0)
+            if C == 768:
+                if 256 <= path.rows < 384:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig(x.contiguous(), weight)
+                if 96 <= path.rows < 112:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig(x.contiguous(), weight)
+            if C == 1024:
+                if 256 <= path.rows < 384:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 32, 2)
+                if 192 <= path.rows < 256:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 0, 0)
+                if 96 <= path.rows < 160:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 32, 2)
+            if C == 2048 and 128 <= path.rows < 160:
+                return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 0, 3)
+            if C == 2560:
+                if path.rows >= 192:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 32, 5)
+                if path.rows >= 160:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 0, 4)
+                if path.rows >= 128:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 32, 5)
+                if path.rows >= 112:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 128, 4)
+                if path.rows >= 96:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 128, 4)
+                if path.rows >= 80:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 0, 3)
+                if path.rows >= 72:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 32, 4)
+                if path.rows >= 3:
+                    return torch.ops.rwkv7_v3a_ops.linear_f16_orig(x.contiguous(), weight)
             if path.rows >= 1024:
                 return torch.ops.rwkv7_v3a_ops.linear_f16_orig_lt_cfg(x.contiguous(), weight, 0, 0)
             if path.rows >= 768:
         return self.linear_lowrank_orig(x, weight) if weight is not None else self.linear_t_orig(x, weight_t)
     def linear_rank_out(self, x: torch.Tensor, weight: torch.Tensor, weight_t: torch.Tensor, rows: int) -> torch.Tensor:
         # Choose which implementation handles this call:
         #   * the custom op rwkv7_v3a_ops.linear_t_f16 on (x.contiguous(), weight_t)
         #     when weight_t is supplied and both size thresholds below hold;
         #   * otherwise self.linear_lowrank_orig(x, weight) when weight is supplied,
         #     else self.linear_t_orig(x, weight_t).
         # NOTE(review): `C` is not assigned in the lines visible here; presumably it
         # is set on a line elided from this diff view -- confirm against the full file.
+        if weight_t is not None and C >= LOWRANK_FUSED_MIN_C and rows <= LOWRANK_OUT_ROWS_T:
             return torch.ops.rwkv7_v3a_ops.linear_t_f16(x.contiguous(), weight_t)
         # Fallback: thresholds not met or no transposed weight was provided.
         return self.linear_lowrank_orig(x, weight) if weight is not None else self.linear_t_orig(x, weight_t)
     def linear_rank_out_act(self, x: torch.Tensor, weight: torch.Tensor, weight_t: torch.Tensor, rows: int, act: int) -> torch.Tensor:
+        if weight_t is not None and C >= LOWRANK_FUSED_MIN_C and rows <= LOWRANK_OUT_ROWS_T:
             return torch.ops.rwkv7_v3a_ops.linear_t_act_f16(x.contiguous(), weight_t, act)
         ops = torch.ops.rwkv7_fast_ops_fp16
         x = ops.act_tanh(x.contiguous()) if act == 1 else ops.act_sigmoid(x.contiguous())