{
  "model": {
    "d_model": 512,
    "d_ff": 2048,
    "n_layers": 6,
    "dropout": 0.1,
    "max_src_len": 512,
    "max_tgt_len": 256,
    "src_vocab_size": 32000,
    "tgt_vocab_size": 32000
  },
  "attention": {
    "name": "sliding_gqa",
    "n_heads": 8,
    "n_kv_heads": 2,
    "window_size": 128,
    "bias": false
  },
  "feedforward": {
    "name": "relu_ffn",
    "d_ff": 2048
  },
  "normalization": {
    "name": "layernorm"
  },
  "positional": {
    "name": "sinusoidal",
    "dropout": 0.1
  },
  "connection": {
    "name": "residual",
    "dropout": 0.1,
    "norm": "layernorm"
  },
  "optimizer": {
    "name": "adamw"
  },
  "scheduler": {
    "name": "none"
  },
  "loss": {
    "name": "cross_entropy",
    "label_smoothing": 0.1
  },
  "data": {
    "name": "meetingbank",
    "hf_path": "huuuyeah/meetingbank",
    "tokenizer_dir": "tokenizers",
    "tokenizer_basename": "meetingbank",
    "vocab_size": 32000,
    "max_src_len": 512,
    "max_tgt_len": 256,
    "val_batch_size": 1,
    "num_workers": 0,
    "limit": 0
  },
  "logging": {
    "backend": "tensorboard"
  },
  "seed": 42,
  "experiment_name": "run_sliding_gqa",
  "training": {
    "num_epochs": 20,
    "batch_size": 8,
    "lr": 0.0001,
    "weight_decay": 0.01,
    "grad_clip": 0.0,
    "ckpt_dir": "weights",
    "ckpt_basename": "meeting_model",
    "preload": null,
    "tui": true,
    "save_every_n_epochs": 4,
    "hf": {
      "push": false,
      "repo_id": null,
      "private": false,
      "commit_message": null
    }
  }
}