{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"},"kaggle":{"accelerator":"nvidiaTeslaT4","dataSources":[{"sourceId":13775859,"sourceType":"datasetVersion","datasetId":8768079},{"sourceId":13776460,"sourceType":"datasetVersion","datasetId":8768537}],"dockerImageVersionId":31193,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# ===========================\n# Cell 1 — Install packages\n# ===========================\n!python -m pip install --upgrade pip\n!python -m pip install --upgrade \\\n protobuf==4.23.4 \\\n transformers==4.53.3 \\\n peft==0.16.0 \\\n tokenizers==0.21.2 \\\n huggingface-hub==0.36.0 \\\n safetensors==0.5.3 \\\n accelerate==1.9.0 \\\n soundfile librosa pydub tqdm\n\n!pip install -q git+https://github.com/hubertsiuzdak/snac.git\n\nimport os\nimport math\nfrom typing import List, Dict, Any\n\nimport torch\nimport torchaudio\nimport pandas as pd\n\nprint(\"Torch:\", torch.__version__)\nprint(\"CUDA available:\", torch.cuda.is_available())\n","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# ===========================\n# Cell 2 — CONFIG (edit if needed)\n# ===========================\n\n# 1) Combined CSV produced from BanglaSER + BANSpEmo transcription & tagging\n# Put it either in /kaggle/working or as an input dataset and adjust path.\nCOMBINED_CSV = \"/kaggle/input/bangla-emottional-texts-labelled/BanglaSER_BANSpEmo_combined_transcribed_tagged_combined.csv\"\n\n# 2) Audio roots (adjust to your Kaggle input folders)\n# Use `!ls /kaggle/input` in a scratch cell to confirm names.\nBANGLASER_ROOT = \"/kaggle/input/bangla-emotional-speech-dataset/BanglaSER Dataset/BanglaSER Dataset\"\nBANSP_ROOT = \"/kaggle/input/bangla-emotional-speech-dataset/BANSpEmo A Bangla Language Emotional Speech Recognition Dataset/BANSpEmo A Bangla Language Emotional Speech Recognition Dataset/BANSpEmo Dataset\"\n\n# 3) Outputs\nOUT_DATASET_PT = \"/kaggle/working/svara_snac_dataset.pt\"\nOUT_ADAPTER_DIR = \"/kaggle/working/svara_lora_adapter\"\n\n# 4) Models\nSVARA_MODEL_ID = \"kenpath/svara-tts-v1\"\nSNAC_MODEL_ID = \"hubertsiuzdak/snac_24khz\"\n\n# 5) Audio\nTARGET_SR = 24000\nDEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\"\nprint(\"Device:\", DEVICE)\n\nif not os.path.exists(COMBINED_CSV):\n raise FileNotFoundError(f\"Combined CSV not found: {COMBINED_CSV}\")\n\ndf = pd.read_csv(COMBINED_CSV)\nprint(\"Combined CSV rows:\", len(df))\nprint(\"Columns:\", df.columns.tolist())\n","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# ===========================\n# Cell 3 — Svara token constants\n# (copied from Svara token module)\n# ===========================\n\nTOKENISER_LENGTH = 128256 # base vocab size\n\nSTART_OF_TEXT = 128000\nEND_OF_TEXT = 128001\nEOT_ID = 128009\n\nSTART_OF_SPEECH = TOKENISER_LENGTH + 1 # 128257\nEND_OF_SPEECH = TOKENISER_LENGTH + 2 # 128258\nSTART_OF_HUMAN = TOKENISER_LENGTH + 3 # 128259\nEND_OF_HUMAN = TOKENISER_LENGTH + 4 # 128260\nSTART_OF_AI = TOKENISER_LENGTH + 5 # 128261\nEND_OF_AI = TOKENISER_LENGTH + 6 # 128262\nPAD_TOKEN = TOKENISER_LENGTH + 7 # 128263\n\nAUDIO_TOKENS_START = TOKENISER_LENGTH + 10 # 128266\nAUDIO_VOCAB_SIZE = 4096\n\nAUDIO_TOKEN_OFFSETS = [\n AUDIO_TOKENS_START + (0 * AUDIO_VOCAB_SIZE), # 128266 - level0\n AUDIO_TOKENS_START + (1 * AUDIO_VOCAB_SIZE), # 132362 - level1 even\n AUDIO_TOKENS_START + (2 * AUDIO_VOCAB_SIZE), # 136458 - level2 stream 0\n AUDIO_TOKENS_START + (3 * AUDIO_VOCAB_SIZE), # 140554 - level2 stream 1\n AUDIO_TOKENS_START + (4 * AUDIO_VOCAB_SIZE), # 144650 - level1 odd\n AUDIO_TOKENS_START + (5 * AUDIO_VOCAB_SIZE), # 148746 - level2 stream 2\n AUDIO_TOKENS_START + (6 * AUDIO_VOCAB_SIZE), # 152842 - level2 stream 3\n]\n\nBEGIN_OF_TEXT_STR = \"<|begin_of_text|>\"\nEND_OF_TEXT_STR = \"<|end_of_text|>\"\nEOT_ID_STR = \"<|eot_id|>\"\nAUDIO_STR = \"<|audio|>\"\n\nprint(\"AUDIO_TOKENS_START:\", AUDIO_TOKENS_START)\nprint(\"AUDIO_TOKEN_OFFSETS:\", AUDIO_TOKEN_OFFSETS)\nprint(\"AUDIO_STR:\", AUDIO_STR)\n","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# ===========================\n# Cell 4 — Load Svara tokenizer + SNAC codec\n# ===========================\n\nfrom transformers import AutoTokenizer\nfrom snac import SNAC\n\nprint(\"Loading Svara tokenizer...\")\ntokenizer = AutoTokenizer.from_pretrained(\n SVARA_MODEL_ID,\n use_fast=False,\n trust_remote_code=True\n)\nprint(\"Tokenizer vocab size:\", tokenizer.vocab_size)\n\naudio_start_id = tokenizer.convert_tokens_to_ids(AUDIO_STR)\nprint(f\"AUDIO_STR '{AUDIO_STR}' -> id {audio_start_id}\")\n\nprint(\"Loading SNAC 24kHz codec...\")\nsnac_model = SNAC.from_pretrained(SNAC_MODEL_ID).to(DEVICE).eval()\nprint(\"SNAC loaded.\")\n","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# ===========================\n# Cell 5 — Helper functions (audio → SNAC → Svara tokens)\n# ===========================\n\nfrom typing import Tuple\nfrom tqdm.auto import tqdm\n\nresampler_cache: Dict[int, torchaudio.transforms.Resample] = {}\n\ndef load_wav_24k_mono(path: str, target_sr: int = TARGET_SR) -> torch.Tensor:\n wav, sr = torchaudio.load(path) # [C, T]\n if wav.shape[0] > 1:\n wav = wav.mean(dim=0, keepdim=True)\n if sr != target_sr:\n if sr not in resampler_cache:\n resampler_cache[sr] = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)\n wav = resampler_cache[sr](wav)\n wav = wav / max(1e-9, wav.abs().max())\n return wav # [1, T]\n\n@torch.no_grad()\ndef encode_snac_codes(wav_24k: torch.Tensor) -> List[torch.Tensor]:\n audio = wav_24k.unsqueeze(0).to(DEVICE) # [B=1, 1, T]\n _, codes = snac_model(audio) # list[Tensor[B, T_level]]\n return [c[0].to(\"cpu\") for c in codes] # [T0], [T1], [T2]\n\ndef get_audio_path(row: pd.Series) -> str:\n ds = str(row[\"dataset\"])\n fname = str(row[\"file\"])\n\n if ds == \"BanglaSER\":\n folder_label = str(row.get(\"folder_label\", \"\") or \"\")\n if os.path.sep in fname:\n return os.path.join(BANGLASER_ROOT, fname)\n elif folder_label:\n return os.path.join(BANGLASER_ROOT, folder_label, fname)\n else:\n return os.path.join(BANGLASER_ROOT, fname)\n elif ds == \"BANSpEmo\":\n return os.path.join(BANSP_ROOT, fname)\n else:\n raise ValueError(f\"Unknown dataset: {ds}\")\n\ndef pack_svara_audio_tokens(snac_codes: List[torch.Tensor]) -> List[int]:\n \"\"\"\n Svara packing:\n snac_codes[0] -> level0 [T]\n snac_codes[1] -> level1 [2T]\n snac_codes[2] -> level2 [4T]\n \"\"\"\n level0 = snac_codes[0]\n level1 = snac_codes[1]\n level2 = snac_codes[2]\n\n level1_even = level1[0::2]\n level1_odd = level1[1::2]\n\n level2_0 = level2[0::4]\n level2_1 = level2[1::4]\n level2_2 = level2[2::4]\n level2_3 = level2[3::4]\n\n T = len(level0)\n assert all(len(x) == T for x in [level1_even, level1_odd, level2_0, level2_1, level2_2, level2_3]), \\\n \"SNAC level lengths mismatch. Check SNAC version.\"\n\n audio_ids: List[int] = []\n for i in range(T):\n audio_ids.append(AUDIO_TOKEN_OFFSETS[0] + int(level0[i]))\n audio_ids.append(AUDIO_TOKEN_OFFSETS[1] + int(level1_even[i]))\n audio_ids.append(AUDIO_TOKEN_OFFSETS[2] + int(level2_0[i]))\n audio_ids.append(AUDIO_TOKEN_OFFSETS[3] + int(level2_1[i]))\n audio_ids.append(AUDIO_TOKEN_OFFSETS[4] + int(level1_odd[i]))\n audio_ids.append(AUDIO_TOKEN_OFFSETS[5] + int(level2_2[i]))\n audio_ids.append(AUDIO_TOKEN_OFFSETS[6] + int(level2_3[i]))\n return audio_ids\n\ndef build_input_and_labels(text: str, audio_ids: List[int]) -> Dict[str, List[int]]:\n \"\"\"\n input_ids = [text_ids] + [<|audio|>] + [audio_ids]\n labels = [-100 for text_ids + <|audio|>] + [audio_ids]\n \"\"\"\n text_ids = tokenizer.encode(text, add_special_tokens=True)\n seq = text_ids.copy()\n seq.append(audio_start_id)\n audio_prefix_len = len(seq)\n seq.extend(audio_ids)\n\n input_ids = seq\n labels = [-100] * audio_prefix_len + audio_ids\n assert len(input_ids) == len(labels)\n return {\"input_ids\": input_ids, \"labels\": labels}\n","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# ===========================\n# Cell 6 — Build svara_snac_dataset.pt\n# ===========================\n\nexamples: List[Dict[str, Any]] = []\n\nfor idx, row in tqdm(df.iterrows(), total=len(df), desc=\"Building SNAC dataset\"):\n text = str(row[\"transcript\"]) # already includes ///\n\n # 1) WAV path\n try:\n wav_path = get_audio_path(row)\n except Exception as e:\n print(f\"[Row {idx}] path error: {e}\")\n continue\n\n if not os.path.exists(wav_path):\n print(f\"[Row {idx}] missing audio:\", wav_path)\n continue\n\n # 2) SNAC encoding\n try:\n wav_24k = load_wav_24k_mono(wav_path)\n snac_codes = encode_snac_codes(wav_24k)\n audio_ids = pack_svara_audio_tokens(snac_codes)\n except Exception as e:\n print(f\"[Row {idx}] SNAC error on {wav_path}: {e}\")\n continue\n\n # 3) Build LM sequence\n seq = build_input_and_labels(text, audio_ids)\n\n examples.append({\n \"dataset\": str(row[\"dataset\"]),\n \"file\": str(row[\"file\"]),\n \"emotion_norm\": str(row.get(\"emotion_norm\", \"\")),\n \"tag\": str(row.get(\"tag\", \"\")),\n \"text\": text,\n \"input_ids\": seq[\"input_ids\"],\n \"labels\": seq[\"labels\"],\n })\n\nprint(\"Total examples:\", len(examples))\ntorch.save(examples, OUT_DATASET_PT)\nprint(\"Saved dataset to:\", OUT_DATASET_PT)\n","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# ===========================\n# Cell 7 — Sanity check & Dataset wrapper\n# ===========================\n\nloaded = torch.load(OUT_DATASET_PT)\nprint(\"Loaded examples:\", len(loaded))\n\nif loaded:\n ex0 = loaded[0]\n print(\"Example 0 keys:\", ex0.keys())\n print(\"Dataset:\", ex0[\"dataset\"], \"File:\", ex0[\"file\"])\n print(\"Emotion:\", ex0[\"emotion_norm\"], ex0[\"tag\"])\n print(\"Text snippet:\", ex0[\"text\"][:120], \"...\")\n print(\"input_ids len:\", len(ex0[\"input_ids\"]), \"labels len:\", len(ex0[\"labels\"]))\n first_audio_idx = next((i for i, v in enumerate(ex0[\"labels\"]) if v != -100), None)\n print(\"First audio label idx:\", first_audio_idx)\n\nfrom torch.utils.data import Dataset, random_split\n\nclass SvaraSnacDataset(Dataset):\n def __init__(self, items: List[Dict[str, Any]]):\n self.items = items\n def __len__(self): return len(self.items)\n def __getitem__(self, idx):\n ex = self.items[idx]\n return {\"input_ids\": ex[\"input_ids\"], \"labels\": ex[\"labels\"]}\n\nfull_dataset = SvaraSnacDataset(loaded)\nprint(\"Full dataset size:\", len(full_dataset))\n\ntrain_size = int(0.9 * len(full_dataset))\nvalid_size = len(full_dataset) - train_size\ntrain_ds, valid_ds = random_split(full_dataset, [train_size, valid_size])\nprint(\"Train size:\", len(train_ds), \"Valid size:\", len(valid_ds))\n","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# ===========================\n# Cell 8 — Collator (pad input_ids & labels)\n# ===========================\n\ndef svara_collate_fn(batch):\n max_len = max(len(ex[\"input_ids\"]) for ex in batch)\n pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0\n\n input_ids = []\n attention_masks = []\n labels = []\n\n for ex in batch:\n ids = ex[\"input_ids\"]\n lbl = ex[\"labels\"]\n pad_len = max_len - len(ids)\n\n input_ids.append(ids + [pad_id] * pad_len)\n attention_masks.append([1] * len(ids) + [0] * pad_len)\n labels.append(lbl + [-100] * pad_len)\n\n return {\n \"input_ids\": torch.tensor(input_ids, dtype=torch.long),\n \"attention_mask\": torch.tensor(attention_masks, dtype=torch.long),\n \"labels\": torch.tensor(labels, dtype=torch.long),\n }\n","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# ===========================\n# Cell 9 — Load Svara base model + LoRA\n# ===========================\n\nfrom transformers import AutoModelForCausalLM, TrainingArguments, Trainer\nfrom peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training\n\nos.makedirs(OUT_ADAPTER_DIR, exist_ok=True)\nos.environ[\"WANDB_DISABLED\"] = \"true\"\n\nprint(\"Loading Svara base model...\")\nmodel = AutoModelForCausalLM.from_pretrained(\n SVARA_MODEL_ID,\n trust_remote_code=True,\n device_map=\"auto\"\n)\nprint(\"Model loaded.\")\n\nmodel = prepare_model_for_kbit_training(model)\n\ntarget_modules = [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"wq\", \"wk\", \"wv\", \"wo\", \"proj\"]\nlora_conf = LoraConfig(\n r=8,\n lora_alpha=32,\n target_modules=target_modules,\n lora_dropout=0.1,\n bias=\"none\",\n task_type=\"CAUSAL_LM\",\n)\n\nmodel = get_peft_model(model, lora_conf)\n\ntrainable = sum(p.numel() for p in model.parameters() if p.requires_grad)\ntotal = sum(p.numel() for p in model.parameters())\nprint(f\"Trainable params: {trainable} / {total}\")\n","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# ===========================\n# Cell 10 — Trainer & training\n# ===========================\n\nEPOCHS = 3\nPER_DEVICE_BATCH = 1\nGRAD_ACC = 4\nLR = 3e-4\n\ntraining_args = TrainingArguments(\n output_dir=OUT_ADAPTER_DIR,\n num_train_epochs=EPOCHS,\n per_device_train_batch_size=PER_DEVICE_BATCH,\n gradient_accumulation_steps=GRAD_ACC,\n learning_rate=LR,\n fp16=True,\n logging_steps=50,\n save_steps=500,\n save_total_limit=3,\n eval_strategy=\"steps\",\n eval_steps=500,\n remove_unused_columns=False,\n push_to_hub=False,\n report_to=\"none\",\n)\n\ntrainer = Trainer(\n model=model,\n args=training_args,\n train_dataset=train_ds,\n eval_dataset=valid_ds,\n data_collator=svara_collate_fn,\n)\n\nprint(\"Starting training...\")\ntrainer.train()\nprint(\"Training finished.\")\n","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"# ===========================\n# Cell 11 — Save LoRA adapter + tokenizer\n# ===========================\n\nmodel.save_pretrained(OUT_ADAPTER_DIR)\ntokenizer.save_pretrained(OUT_ADAPTER_DIR)\nprint(\"Saved LoRA adapter + tokenizer to:\", OUT_ADAPTER_DIR)\n","metadata":{"trusted":true},"outputs":[],"execution_count":null}]}