{ "model": "Qwen/Qwen3-8B", "quantization": "4bit", "dataset": "allenai/c4", "dataset_config": "en", "dataset_split": "train", "text_field": "text", "seq_len": 2048, "train_tokens": 15000000, "val_tokens": 1000000, "sparse_layers": [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35 ], "top_k": 2048, "query_samples": 32, "tail_query_samples": 8, "indexer_heads": 6, "proj_dim": 69, "rope_dim": 64, "indexer_params_per_layer": 2003427, "indexer_params": 72123372, "target_indexer_params": 0, "target_indexer_params_per_layer": 2000000, "target_indexer_params_per_layer_effective": 2000000, "support_loss_weight": 0.1, "support_top_k": 2048, "loss_agg": "per-layer", "layer_weights": null, "lr_schedule": "warmup-cosine", "warmup_frac": 0.05, "min_lr": 1e-06, "model_type": "qwen3", "hidden_size": 4096, "n_layers": 36, "max_position_embeddings": 40960, "metadata": { "model_type": "qwen3", "hidden_size": 4096, "n_layers": 36, "max_position_embeddings": 40960 } }