---
# Experiment configuration: model architecture, optimisation, data, logging,
# training loop and Hugging Face upload settings.
#
# NOTE(review): this file was previously collapsed onto a single line, which is
# not parseable as a YAML block mapping. The nesting below is reconstructed
# from key order and naming; confirm it against the config loader's schema.

model:
  d_model: 512
  d_ff: 2048  # same value as feedforward.d_ff below
  n_layers: 6
  dropout: 0.1
  max_src_len: 512  # same value as data.max_src_len
  max_tgt_len: 256  # same value as data.max_tgt_len
  src_vocab_size: 32000  # same value as data.vocab_size
  tgt_vocab_size: 32000  # same value as data.vocab_size

  # NOTE(review): the component sections below are assumed to be children of
  # `model`; they may instead be top-level keys - verify against the loader.
  attention:
    name: sliding_gqa
    n_heads: 8
    n_kv_heads: 2  # n_heads (8) is an exact multiple of n_kv_heads (2)
    window_size: 128
    bias: false
  feedforward:
    name: relu_ffn
    d_ff: 2048
  normalization:
    name: layernorm
  positional:
    name: sinusoidal
    dropout: 0.1
  connection:
    name: residual
    dropout: 0.1
    norm: layernorm

optimizer:
  name: adamw

scheduler:
  # Quoted so it is unmistakably the string "none" and not YAML null.
  name: 'none'

loss:
  name: cross_entropy
  label_smoothing: 0.1

data:
  name: meetingbank
  hf_path: huuuyeah/meetingbank
  tokenizer_dir: tokenizers
  tokenizer_basename: meetingbank
  vocab_size: 32000
  max_src_len: 512
  max_tgt_len: 256
  val_batch_size: 1
  num_workers: 0
  limit: 0  # presumably 0 means "no limit" - TODO confirm with the data loader

logging:
  backend: tensorboard

# NOTE(review): `seed` and `experiment_name` are assumed to be top-level keys
# rather than children of `logging` - confirm.
seed: 42
experiment_name: run_sliding_gqa

training:
  num_epochs: 20
  batch_size: 8
  lr: 0.0001
  weight_decay: 0.01
  grad_clip: 0.0  # presumably 0.0 disables gradient clipping - TODO confirm
  ckpt_dir: weights
  ckpt_basename: meeting_model
  preload: null
  tui: true
  save_every_n_epochs: 4

# NOTE(review): `hf` is assumed to be a top-level section rather than a child
# of `training` - confirm.
hf:
  push: false
  repo_id: null
  private: false
  commit_message: null