Spaces:

WeReCooking2
/

magenta-rt-piano-cpu

Sleeping

App Files Files Community

Nekochu committed 13 days ago

Commit

69afa51

1 Parent(s): ec30eba

Add model size dropdown (small/large), lazy-load LLM sessions, infer KV geometry from ONNX shapes

Browse files

Files changed (1) hide show

app.py +85 -45

app.py CHANGED Viewed

@@ -22,10 +22,6 @@ COND_OFFSET = 7            # NUM_RESERVED + 1, added to every conditioning integ
 COND_LEN = 144             # 12 style + 128 notes + 1 drum + 3 cfg
 VOCAB_SIZE = NUM_RESERVED + NUM_CB * CODEBOOK  # 12294
-# mrt2_small KV geometry
-T_LAYERS, T_W, T_H, T_HD = 12, 41, 8, 128   # temporal
-D_LAYERS, D_W, D_H, D_HD = 2, 12, 6, 128    # depth
 # note states
 NOTE_MASKED, NOTE_OFF, NOTE_ON = -1, 0, 3
 DRUM_MASKED = -1
@@ -117,25 +113,72 @@ def _sess(path: str) -> ort.InferenceSession:
     opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
     return ort.InferenceSession(path, opts, providers=["CPUExecutionProvider"])
 text_enc_s = _sess(f"{MODEL_PATH}/musiccoca/text_encoder.onnx")
 mapper_s   = _sess(f"{MODEL_PATH}/musiccoca/mapper.onnx")
 vq_s       = _sess(f"{MODEL_PATH}/musiccoca/pretrained_vector_quantizer.onnx")
-enc_s      = _sess(f"{MODEL_PATH}/mrt2_small/onnx/encoder.onnx")
-temp_s     = _sess(f"{MODEL_PATH}/mrt2_small/onnx/temporal_step.onnx")
-depth_s    = _sess(f"{MODEL_PATH}/mrt2_small/onnx/depth_step.onnx")
-embed_s    = _sess(f"{MODEL_PATH}/mrt2_small/onnx/embed.onnx")
 dec_s      = _sess(f"{MODEL_PATH}/spectrostream/decoder.onnx")
 sp         = spm_lib.SentencePieceProcessor(model_file=f"{MODEL_PATH}/musiccoca/spm.model")
-# Log I/O names at startup for debugging
-for name, s in [("text_enc", text_enc_s), ("mapper", mapper_s), ("vq", vq_s),
-                ("enc", enc_s), ("temporal", temp_s), ("depth", depth_s),
-                ("embed", embed_s), ("decoder", dec_s)]:
     ins  = {i.name: i.shape for i in s.get_inputs()}
     outs = {o.name: o.shape for o in s.get_outputs()}
     print(f"[{name}] inputs: {ins}")
     print(f"[{name}] outputs: {outs}")
 # ---------- helper: discretize CFG ----------
 def _disc_cfg(v: float, step: float, max_bin: int) -> int:
     c = max(-1.0, min(7.0, v))
@@ -143,7 +186,7 @@ def _disc_cfg(v: float, step: float, max_bin: int) -> int:
 # ---------- conditioning vector ----------
 def build_cond(style_tokens: list, notes: list, cfg_mcc=CFG_MCC, cfg_notes=CFG_NOTES, cfg_drums=CFG_DRUMS) -> np.ndarray:
-    """Build 144-length cond vector, shifted by COND_OFFSET, shape [1,1,144] int32."""
     out = [0] * COND_LEN
     k = 0
     for i in range(NUM_CB):
@@ -169,7 +212,6 @@ def encode_text(prompt: str) -> list:
     pad_mask = np.ones((1, 128), dtype=np.float32)
     pad_mask[0, :len(ids_raw)+1] = 0.0
-    # text encoder - feed by position (name detection fallback)
     enc_inputs = text_enc_s.get_inputs()
     feed_enc = {}
     for inp in enc_inputs:
@@ -179,7 +221,6 @@ def encode_text(prompt: str) -> list:
             feed_enc[inp.name] = ids
     emb = text_enc_s.run(None, feed_enc)[0]  # [1, 768]
-    # mapper - feed by position (args_0=emb, args_1=noise)
     map_inputs = mapper_s.get_inputs()
     feed_map = {}
     for inp in map_inputs:
@@ -189,13 +230,11 @@ def encode_text(prompt: str) -> list:
             feed_map[inp.name] = emb
     mapped = mapper_s.run(None, feed_map)[0]  # [1, 768]
-    # L2 normalize
     norm = np.linalg.norm(mapped)
     if norm > 1e-8:
         mapped = mapped / norm
-    # VQ quantizer
-    style_tokens = vq_s.run(None, {vq_s.get_inputs()[0].name: mapped})[0]  # [1, 12]
     return style_tokens.reshape(-1).tolist()
 # ---------- sampling ----------
@@ -225,6 +264,7 @@ def generate(
     n_seconds: float,
     temperature: float,
     cfg_mcc: float,
     progress=gr.Progress(track_tqdm=True),
 ) -> tuple:
     import json
@@ -236,17 +276,21 @@ def generate(
     n_frames = max(4, int(n_seconds * 25))
-    # Style tokens from text
-    progress(0.0, desc="Encoding prompt...")
     style_tokens = encode_text(prompt)
-    # Conditioning vector
     cond = build_cond(style_tokens, held_notes, cfg_mcc=cfg_mcc)
-    # Encode conditioning -> enc_out [1,1,256]
     enc_out_arr = enc_s.run(None, {enc_s.get_inputs()[0].name: cond})[0]
-    # Init temporal KV cache (zeros)
     psk = [np.zeros((1, T_W, T_H, T_HD), np.float32) for _ in range(T_LAYERS)]
     psv = [np.zeros((1, T_W, T_H, T_HD), np.float32) for _ in range(T_LAYERS)]
     pck = [np.zeros((1, T_W, T_H, T_HD), np.float32) for _ in range(T_LAYERS)]
@@ -254,16 +298,14 @@ def generate(
     prev_codes = np.zeros((1, NUM_CB), dtype=np.int64)
     cache_pos = 0
-    # Cache output name lists (same every frame)
     t_out_names = [o.name for o in temp_s.get_outputs()]
     d_out_names = [o.name for o in depth_s.get_outputs()]
     all_codec_frames = []
     for f in range(n_frames):
-        progress((f + 1) / (n_frames + 1), desc=f"Generating frame {f+1}/{n_frames} (this takes a while on CPU...)")
-        # Temporal step
         feed_t = {
             "prev_codes": prev_codes,
             "enc_out":    enc_out_arr,
@@ -276,17 +318,16 @@ def generate(
             feed_t[f"past_cross_v.{i}"] = pcv[i]
         t_out_dict = dict(zip(t_out_names, temp_s.run(None, feed_t)))
-        temporal_out = t_out_dict["temporal_out"]  # [1,1,1024]
         for i in range(T_LAYERS):
             psk[i] = t_out_dict[f"present_self_k.{i}"]
             psv[i] = t_out_dict[f"present_self_v.{i}"]
             pck[i] = t_out_dict[f"present_cross_k.{i}"]
             pcv[i] = t_out_dict[f"present_cross_v.{i}"]
-        # Depth loop: generate 12 unique-scheme tokens per frame
         dk = [np.zeros((1, D_W, D_H, D_HD), np.float32) for _ in range(D_LAYERS)]
         dv = [np.zeros((1, D_W, D_H, D_HD), np.float32) for _ in range(D_LAYERS)]
-        depth_in = temporal_out  # [1,1,1024]
         unique_codes = []
         for level in range(NUM_CB):
@@ -299,7 +340,7 @@ def generate(
                 feed_d[f"past_v.{i}"] = dv[i]
             d_out_dict = dict(zip(d_out_names, depth_s.run(None, feed_d)))
-            logits = d_out_dict["logits"]  # [1, VOCAB_SIZE]
             for i in range(D_LAYERS):
                 dk[i] = d_out_dict[f"present_k.{i}"]
                 dv[i] = d_out_dict[f"present_v.{i}"]
@@ -311,7 +352,7 @@ def generate(
             if level < NUM_CB - 1:
                 e_out = embed_s.run(None, {"token": np.array([token], dtype=np.int64)})
-                depth_in = e_out[0]  # [1,1,1024]
         codec_frame = to_codec(unique_codes)
         all_codec_frames.append(codec_frame)
@@ -321,13 +362,11 @@ def generate(
     if len(all_codec_frames) < 2:
         return (SAMPLE_RATE, np.zeros((FRAME_SAMPLES * 2, 2), dtype=np.float32))
-    # SpectroStream batch decode: pass all T frames, get (T-1)*1920 stereo samples
     progress(0.98, desc="Decoding audio...")
     codes_arr = np.array(all_codec_frames, dtype=np.int32).reshape(1, len(all_codec_frames), NUM_CB)
     audio_raw = dec_s.run(None, {"codes": codes_arr})[0]  # [1, (T-1)*1920, 2]
-    audio = audio_raw.squeeze(0)  # [(T-1)*1920, 2]
-    # Clamp
     audio = np.clip(audio, -1.0, 1.0).astype(np.float32)
     progress(1.0, desc="Done!")
     return (SAMPLE_RATE, audio)
@@ -398,8 +437,6 @@ PIANO_HTML = """
     {midi:79,n:'G5'},{midi:81,n:'A5'},{midi:83,n:'B5'},
     {midi:84,n:'C6'}
   ];
-  // Each entry: MIDI -> [white-key-index-of-left-neighbor, 0]
-  // C#/Db is right of C (index 0,7,14), D# right of D (1,8,15), etc.
   const BLACK_POSITIONS = {
     49:[0,0], 51:[1,0], 54:[3,0], 56:[4,0], 58:[5,0],
     61:[7,0], 63:[8,0], 66:[10,0], 68:[11,0], 70:[12,0],
@@ -410,7 +447,6 @@ PIANO_HTML = """
   let held = new Set();
   const piano = document.getElementById('piano');
-  // Draw white keys
   WHITE_NOTES.forEach((wk, idx) => {
     const el = document.createElement('div');
     el.className = 'white-key';
@@ -420,12 +456,9 @@ PIANO_HTML = """
     piano.appendChild(el);
   });
-  // Draw black keys
   const whiteKeys = piano.querySelectorAll('.white-key');
   Object.entries(BLACK_POSITIONS).forEach(([midi, [wIdx, _]]) => {
     if (wIdx >= whiteKeys.length) return;
-    const ref = whiteKeys[wIdx].getBoundingClientRect
-      ? whiteKeys[wIdx] : null;
     const el = document.createElement('div');
     el.className = 'black-key';
     el.dataset.midi = midi;
@@ -476,10 +509,10 @@ PIANO_HTML = """
 </script>
 """
-def _generate_wrapper(prompt, notes_json, n_seconds, temperature, cfg_mcc, progress=gr.Progress()):
     if not prompt.strip():
         prompt = "smooth jazz piano"
-    return generate(prompt, notes_json, n_seconds, temperature, cfg_mcc, progress)
 with gr.Blocks(title="Magenta RT2 - Piano (CPU)") as demo:
     gr.HTML("""
@@ -510,6 +543,12 @@ with gr.Blocks(title="Magenta RT2 - Piano (CPU)") as demo:
                 value="smooth jazz piano, warm, relaxed",
                 lines=2
             )
             n_seconds = gr.Slider(1, 20, value=5, step=1, label="Duration (seconds)")
             temperature = gr.Slider(0.1, 1.5, value=0.9, step=0.05, label="Temperature (creativity)")
             cfg_mcc = gr.Slider(0.0, 6.0, value=1.6, step=0.1, label="Style guidance strength")
@@ -522,13 +561,14 @@ with gr.Blocks(title="Magenta RT2 - Piano (CPU)") as demo:
       <b style="color:#aaa;">How to use:</b>
       Click piano keys to hold notes (click again to release) - they steer the melody.
       Type a style prompt, set duration, then hit Generate.
-      <br>CPU generation: ~1-3 min for 5s audio. No MIDI device needed.
     </div>
     """)
     gen_btn.click(
         fn=_generate_wrapper,
-        inputs=[prompt_in, notes_state, n_seconds, temperature, cfg_mcc],
         outputs=[audio_out],
     )

 COND_LEN = 144             # 12 style + 128 notes + 1 drum + 3 cfg
 VOCAB_SIZE = NUM_RESERVED + NUM_CB * CODEBOOK  # 12294
 # note states
 NOTE_MASKED, NOTE_OFF, NOTE_ON = -1, 0, 3
 DRUM_MASKED = -1
     opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
     return ort.InferenceSession(path, opts, providers=["CPUExecutionProvider"])
+# Shared sessions (same for both model sizes)
 text_enc_s = _sess(f"{MODEL_PATH}/musiccoca/text_encoder.onnx")
 mapper_s   = _sess(f"{MODEL_PATH}/musiccoca/mapper.onnx")
 vq_s       = _sess(f"{MODEL_PATH}/musiccoca/pretrained_vector_quantizer.onnx")
 dec_s      = _sess(f"{MODEL_PATH}/spectrostream/decoder.onnx")
 sp         = spm_lib.SentencePieceProcessor(model_file=f"{MODEL_PATH}/musiccoca/spm.model")
+for name, s in [("text_enc", text_enc_s), ("mapper", mapper_s), ("vq", vq_s), ("decoder", dec_s)]:
     ins  = {i.name: i.shape for i in s.get_inputs()}
     outs = {o.name: o.shape for o in s.get_outputs()}
     print(f"[{name}] inputs: {ins}")
     print(f"[{name}] outputs: {outs}")
+# ---------- LLM variant loading (lazy, cached) ----------
+_llm_cache: dict = {}
+def _infer_geometry(temp_sess, depth_sess) -> tuple:
+    """Read KV cache geometry from ONNX input shapes - works for any model size."""
+    t_in = {i.name: i.shape for i in temp_sess.get_inputs()}  # temporal session: input name -> declared shape
+    T_LAYERS = sum(1 for k in t_in if k.startswith("past_self_k."))  # layer count = number of past_self_k.N inputs
+    s = t_in["past_self_k.0"]   # ['B', window, heads, head_dim]
+    T_W, T_H, T_HD = int(s[1]), int(s[2]), int(s[3])  # NOTE(review): int() raises if a dim is symbolic (non-numeric string) - confirm the export uses static sizes
+    d_in = {i.name: i.shape for i in depth_sess.get_inputs()}  # depth session: input name -> declared shape
+    D_LAYERS = sum(1 for k in d_in if k.startswith("past_k."))  # layer count = number of past_k.N inputs
+    s = d_in["past_k.0"]        # ['B', window, heads, head_dim]
+    D_W, D_H, D_HD = int(s[1]), int(s[2]), int(s[3])  # same caveat as the temporal dims above
+    return T_LAYERS, T_W, T_H, T_HD, D_LAYERS, D_W, D_H, D_HD  # order matches the unpacking in _load_llm
+def _load_llm(size: str) -> dict:  # return the session/geometry dict for mrt2_{size}, loading it on first use
+    if size in _llm_cache:  # already loaded for this size: reuse the cached dict
+        return _llm_cache[size]
+    base = f"{MODEL_PATH}/mrt2_{size}/onnx"
+    if not os.path.isdir(base):  # fail with a clear message before any session is created
+        raise FileNotFoundError(f"Model variant '{size}' not found at {base}")
+    print(f"Loading LLM sessions for mrt2_{size}...")
+    enc   = _sess(f"{base}/encoder.onnx")
+    temp  = _sess(f"{base}/temporal_step.onnx")
+    depth = _sess(f"{base}/depth_step.onnx")
+    embed = _sess(f"{base}/embed.onnx")
+    T_LAYERS, T_W, T_H, T_HD, D_LAYERS, D_W, D_H, D_HD = _infer_geometry(temp, depth)  # KV cache sizes read from the ONNX input shapes
+    print(f"  mrt2_{size}: T_LAYERS={T_LAYERS} T_W={T_W} T_H={T_H} T_HD={T_HD} "
+          f"D_LAYERS={D_LAYERS} D_W={D_W} D_H={D_H} D_HD={D_HD}")
+    for name, s in [("enc", enc), ("temporal", temp), ("depth", depth), ("embed", embed)]:  # log I/O names and shapes of each session
+        ins  = {i.name: i.shape for i in s.get_inputs()}
+        outs = {o.name: o.shape for o in s.get_outputs()}
+        print(f"[{name}] inputs: {ins}")
+        print(f"[{name}] outputs: {outs}")
+    result = dict(enc=enc, temp=temp, depth=depth, embed=embed,
+                  T_LAYERS=T_LAYERS, T_W=T_W, T_H=T_H, T_HD=T_HD,
+                  D_LAYERS=D_LAYERS, D_W=D_W, D_H=D_H, D_HD=D_HD)
+    _llm_cache[size] = result  # stored only after every session loaded, so a failed load leaves no cache entry
+    return result
+# Pre-warm small at startup
+_load_llm("small")
+# Detect available sizes
+_SIZES_AVAILABLE = ["small"]
+if os.path.isdir(f"{MODEL_PATH}/mrt2_large"):
+    _SIZES_AVAILABLE.append("large")
+    print("Large model variant detected - will be available in UI")
+else:
+    print("No mrt2_large directory found - only small available")
 # ---------- helper: discretize CFG ----------
 def _disc_cfg(v: float, step: float, max_bin: int) -> int:
     c = max(-1.0, min(7.0, v))
 # ---------- conditioning vector ----------
 def build_cond(style_tokens: list, notes: list, cfg_mcc=CFG_MCC, cfg_notes=CFG_NOTES, cfg_drums=CFG_DRUMS) -> np.ndarray:
+    """Build 144-length cond vector, shifted by COND_OFFSET, shape [1,1,144] int64."""
     out = [0] * COND_LEN
     k = 0
     for i in range(NUM_CB):
     pad_mask = np.ones((1, 128), dtype=np.float32)
     pad_mask[0, :len(ids_raw)+1] = 0.0
     enc_inputs = text_enc_s.get_inputs()
     feed_enc = {}
     for inp in enc_inputs:
             feed_enc[inp.name] = ids
     emb = text_enc_s.run(None, feed_enc)[0]  # [1, 768]
     map_inputs = mapper_s.get_inputs()
     feed_map = {}
     for inp in map_inputs:
             feed_map[inp.name] = emb
     mapped = mapper_s.run(None, feed_map)[0]  # [1, 768]
     norm = np.linalg.norm(mapped)
     if norm > 1e-8:
         mapped = mapped / norm
+    style_tokens = vq_s.run(None, {vq_s.get_inputs()[0].name: mapped})[0]
     return style_tokens.reshape(-1).tolist()
 # ---------- sampling ----------
     n_seconds: float,
     temperature: float,
     cfg_mcc: float,
+    model_size: str,
     progress=gr.Progress(track_tqdm=True),
 ) -> tuple:
     import json
     n_frames = max(4, int(n_seconds * 25))
+    progress(0.0, desc=f"Loading {model_size} model sessions...")
+    m = _load_llm(model_size)
+    enc_s   = m["enc"]
+    temp_s  = m["temp"]
+    depth_s = m["depth"]
+    embed_s = m["embed"]
+    T_LAYERS, T_W, T_H, T_HD = m["T_LAYERS"], m["T_W"], m["T_H"], m["T_HD"]
+    D_LAYERS, D_W, D_H, D_HD = m["D_LAYERS"], m["D_W"], m["D_H"], m["D_HD"]
+    progress(0.02, desc="Encoding prompt...")
     style_tokens = encode_text(prompt)
     cond = build_cond(style_tokens, held_notes, cfg_mcc=cfg_mcc)
     enc_out_arr = enc_s.run(None, {enc_s.get_inputs()[0].name: cond})[0]
     psk = [np.zeros((1, T_W, T_H, T_HD), np.float32) for _ in range(T_LAYERS)]
     psv = [np.zeros((1, T_W, T_H, T_HD), np.float32) for _ in range(T_LAYERS)]
     pck = [np.zeros((1, T_W, T_H, T_HD), np.float32) for _ in range(T_LAYERS)]
     prev_codes = np.zeros((1, NUM_CB), dtype=np.int64)
     cache_pos = 0
     t_out_names = [o.name for o in temp_s.get_outputs()]
     d_out_names = [o.name for o in depth_s.get_outputs()]
     all_codec_frames = []
     for f in range(n_frames):
+        progress((f + 1) / (n_frames + 1), desc=f"Generating frame {f+1}/{n_frames} [{model_size}]...")
         feed_t = {
             "prev_codes": prev_codes,
             "enc_out":    enc_out_arr,
             feed_t[f"past_cross_v.{i}"] = pcv[i]
         t_out_dict = dict(zip(t_out_names, temp_s.run(None, feed_t)))
+        temporal_out = t_out_dict["temporal_out"]
         for i in range(T_LAYERS):
             psk[i] = t_out_dict[f"present_self_k.{i}"]
             psv[i] = t_out_dict[f"present_self_v.{i}"]
             pck[i] = t_out_dict[f"present_cross_k.{i}"]
             pcv[i] = t_out_dict[f"present_cross_v.{i}"]
         dk = [np.zeros((1, D_W, D_H, D_HD), np.float32) for _ in range(D_LAYERS)]
         dv = [np.zeros((1, D_W, D_H, D_HD), np.float32) for _ in range(D_LAYERS)]
+        depth_in = temporal_out
         unique_codes = []
         for level in range(NUM_CB):
                 feed_d[f"past_v.{i}"] = dv[i]
             d_out_dict = dict(zip(d_out_names, depth_s.run(None, feed_d)))
+            logits = d_out_dict["logits"]
             for i in range(D_LAYERS):
                 dk[i] = d_out_dict[f"present_k.{i}"]
                 dv[i] = d_out_dict[f"present_v.{i}"]
             if level < NUM_CB - 1:
                 e_out = embed_s.run(None, {"token": np.array([token], dtype=np.int64)})
+                depth_in = e_out[0]
         codec_frame = to_codec(unique_codes)
         all_codec_frames.append(codec_frame)
     if len(all_codec_frames) < 2:
         return (SAMPLE_RATE, np.zeros((FRAME_SAMPLES * 2, 2), dtype=np.float32))
     progress(0.98, desc="Decoding audio...")
     codes_arr = np.array(all_codec_frames, dtype=np.int32).reshape(1, len(all_codec_frames), NUM_CB)
     audio_raw = dec_s.run(None, {"codes": codes_arr})[0]  # [1, (T-1)*1920, 2]
+    audio = audio_raw.squeeze(0)
     audio = np.clip(audio, -1.0, 1.0).astype(np.float32)
     progress(1.0, desc="Done!")
     return (SAMPLE_RATE, audio)
     {midi:79,n:'G5'},{midi:81,n:'A5'},{midi:83,n:'B5'},
     {midi:84,n:'C6'}
   ];
   const BLACK_POSITIONS = {
     49:[0,0], 51:[1,0], 54:[3,0], 56:[4,0], 58:[5,0],
     61:[7,0], 63:[8,0], 66:[10,0], 68:[11,0], 70:[12,0],
   let held = new Set();
   const piano = document.getElementById('piano');
   WHITE_NOTES.forEach((wk, idx) => {
     const el = document.createElement('div');
     el.className = 'white-key';
     piano.appendChild(el);
   });
   const whiteKeys = piano.querySelectorAll('.white-key');
   Object.entries(BLACK_POSITIONS).forEach(([midi, [wIdx, _]]) => {
     if (wIdx >= whiteKeys.length) return;
     const el = document.createElement('div');
     el.className = 'black-key';
     el.dataset.midi = midi;
 </script>
 """
+def _generate_wrapper(prompt, notes_json, n_seconds, temperature, cfg_mcc, model_size, progress=gr.Progress()):
     if not prompt.strip():
         prompt = "smooth jazz piano"
+    return generate(prompt, notes_json, n_seconds, temperature, cfg_mcc, model_size, progress)
 with gr.Blocks(title="Magenta RT2 - Piano (CPU)") as demo:
     gr.HTML("""
                 value="smooth jazz piano, warm, relaxed",
                 lines=2
             )
+            model_size_dd = gr.Dropdown(
+                choices=_SIZES_AVAILABLE,
+                value="small",
+                label="Model size (large = slower but higher quality, loads on first use)",
+                interactive=len(_SIZES_AVAILABLE) > 1,
+            )
             n_seconds = gr.Slider(1, 20, value=5, step=1, label="Duration (seconds)")
             temperature = gr.Slider(0.1, 1.5, value=0.9, step=0.05, label="Temperature (creativity)")
             cfg_mcc = gr.Slider(0.0, 6.0, value=1.6, step=0.1, label="Style guidance strength")
       <b style="color:#aaa;">How to use:</b>
       Click piano keys to hold notes (click again to release) - they steer the melody.
       Type a style prompt, set duration, then hit Generate.
+      <br>CPU generation: ~1-3 min for 5s audio (small). No MIDI device needed.
+      Large model loads its sessions on first use (extra ~30s), then stays cached.
     </div>
     """)
     gen_btn.click(
         fn=_generate_wrapper,
+        inputs=[prompt_in, notes_state, n_seconds, temperature, cfg_mcc, model_size_dd],
         outputs=[audio_out],
     )