{ "layer_combinations": [ [ 59, 62, 64, 67, 70 ] ], "act_layer_combinations": [ [ 21, 22, 23, 24, 25 ] ], "schema_version": 1, "special_token": " ?", "prefix_template": "Layer: {layer}\\n{special_token} * {num_positions} \\n", "model_name": "Qwen/Qwen3-8B", "hook_onto_layer": 1, "dataset_configs": [ { "custom_dataset_params": { "hf_dataset_repo": "ceselder/cot-oracle-convqa-chunked-sonnet", "hf_split": "train", "cot_prefix_field": "cot_prefix", "stochastic_max_k": 100, "max_cot_prefix_tokens": 2048, "target_field": "target_response" }, "num_train": 280000, "num_test": 0, "splits": [ "train" ], "model_name": "Qwen/Qwen3-8B", "layer_combinations": [ [ 59, 62, 64, 67, 70 ] ], "save_acts": false, "batch_size": 16, "dataset_name": "cot_oracle_convqa", "dataset_folder": "sft_training_data", "seed": 44 }, { "custom_dataset_params": { "classification_dataset_name": "geometry_of_truth", "num_qa_per_sample": 6, "min_end_offset": -1, "max_end_offset": -5, "max_window_size": 1, "min_window_size": 1 }, "num_train": 1500, "num_test": 250, "splits": [ "train", "test" ], "model_name": "Qwen/Qwen3-8B", "layer_combinations": [ [ 59, 62, 64, 67, 70 ] ], "save_acts": false, "batch_size": 16, "dataset_name": "classification_geometry_of_truth", "dataset_folder": "sft_training_data", "seed": 42 }, { "custom_dataset_params": { "classification_dataset_name": "geometry_of_truth", "num_qa_per_sample": 3, "min_end_offset": -1, "max_end_offset": -5, "max_window_size": 50, "min_window_size": 1 }, "num_train": 1500, "num_test": 250, "splits": [ "train", "test" ], "model_name": "Qwen/Qwen3-8B", "layer_combinations": [ [ 59, 62, 64, 67, 70 ] ], "save_acts": false, "batch_size": 16, "dataset_name": "classification_geometry_of_truth", "dataset_folder": "sft_training_data", "seed": 42 }, { "custom_dataset_params": { "classification_dataset_name": "relations", "num_qa_per_sample": 6, "min_end_offset": -1, "max_end_offset": -5, "max_window_size": 1, "min_window_size": 1 }, "num_train": 1500, "num_test": 250, "splits": [ "train", "test" ], "model_name": "Qwen/Qwen3-8B", "layer_combinations": [ [ 59, 62, 64, 67, 70 ] ], "save_acts": false, "batch_size": 16, "dataset_name": "classification_relations", "dataset_folder": "sft_training_data", "seed": 42 }, { "custom_dataset_params": { "classification_dataset_name": "relations", "num_qa_per_sample": 3, "min_end_offset": -1, "max_end_offset": -5, "max_window_size": 50, "min_window_size": 1 }, "num_train": 1500, "num_test": 250, "splits": [ "train", "test" ], "model_name": "Qwen/Qwen3-8B", "layer_combinations": [ [ 59, 62, 64, 67, 70 ] ], "save_acts": false, "batch_size": 16, "dataset_name": "classification_relations", "dataset_folder": "sft_training_data", "seed": 42 }, { "custom_dataset_params": { "classification_dataset_name": "sst2", "num_qa_per_sample": 6, "min_end_offset": -1, "max_end_offset": -5, "max_window_size": 1, "min_window_size": 1 }, "num_train": 1500, "num_test": 250, "splits": [ "train", "test" ], "model_name": "Qwen/Qwen3-8B", "layer_combinations": [ [ 59, 62, 64, 67, 70 ] ], "save_acts": false, "batch_size": 16, "dataset_name": "classification_sst2", "dataset_folder": "sft_training_data", "seed": 42 }, { "custom_dataset_params": { "classification_dataset_name": "sst2", "num_qa_per_sample": 3, "min_end_offset": -1, "max_end_offset": -5, "max_window_size": 50, "min_window_size": 1 }, "num_train": 1500, "num_test": 250, "splits": [ "train", "test" ], "model_name": "Qwen/Qwen3-8B", "layer_combinations": [ [ 59, 62, 64, 67, 70 ] ], "save_acts": false, "batch_size": 16, "dataset_name": "classification_sst2", "dataset_folder": "sft_training_data", "seed": 42 }, { "custom_dataset_params": { "classification_dataset_name": "md_gender", "num_qa_per_sample": 6, "min_end_offset": -1, "max_end_offset": -5, "max_window_size": 1, "min_window_size": 1 }, "num_train": 1500, "num_test": 250, "splits": [ "train", "test" ], "model_name": "Qwen/Qwen3-8B", "layer_combinations": [ [ 59, 62, 64, 67, 70 ] ], "save_acts": false, "batch_size": 16, "dataset_name": "classification_md_gender", "dataset_folder": "sft_training_data", "seed": 42 }, { "custom_dataset_params": { "classification_dataset_name": "md_gender", "num_qa_per_sample": 3, "min_end_offset": -1, "max_end_offset": -5, "max_window_size": 50, "min_window_size": 1 }, "num_train": 1500, "num_test": 250, "splits": [ "train", "test" ], "model_name": "Qwen/Qwen3-8B", "layer_combinations": [ [ 59, 62, 64, 67, 70 ] ], "save_acts": false, "batch_size": 16, "dataset_name": "classification_md_gender", "dataset_folder": "sft_training_data", "seed": 42 }, { "custom_dataset_params": { "classification_dataset_name": "snli", "num_qa_per_sample": 6, "min_end_offset": -1, "max_end_offset": -5, "max_window_size": 1, "min_window_size": 1 }, "num_train": 1500, "num_test": 250, "splits": [ "train", "test" ], "model_name": "Qwen/Qwen3-8B", "layer_combinations": [ [ 59, 62, 64, 67, 70 ] ], "save_acts": false, "batch_size": 16, "dataset_name": "classification_snli", "dataset_folder": "sft_training_data", "seed": 42 }, { "custom_dataset_params": { "classification_dataset_name": "snli", "num_qa_per_sample": 3, "min_end_offset": -1, "max_end_offset": -5, "max_window_size": 50, "min_window_size": 1 }, "num_train": 1500, "num_test": 250, "splits": [ "train", "test" ], "model_name": "Qwen/Qwen3-8B", "layer_combinations": [ [ 59, 62, 64, 67, 70 ] ], "save_acts": false, "batch_size": 16, "dataset_name": "classification_snli", "dataset_folder": "sft_training_data", "seed": 42 }, { "custom_dataset_params": { "classification_dataset_name": "ag_news", "num_qa_per_sample": 6, "min_end_offset": -1, "max_end_offset": -5, "max_window_size": 1, "min_window_size": 1 }, "num_train": 0, "num_test": 250, "splits": [ "test" ], "model_name": "Qwen/Qwen3-8B", "layer_combinations": [ [ 59, 62, 64, 67, 70 ] ], "save_acts": false, "batch_size": 16, "dataset_name": "classification_ag_news", "dataset_folder": "sft_training_data", "seed": 42 }, { "custom_dataset_params": { "classification_dataset_name": "ag_news", "num_qa_per_sample": 3, "min_end_offset": -1, "max_end_offset": -5, "max_window_size": 50, "min_window_size": 1 }, "num_train": 0, "num_test": 250, "splits": [ "test" ], "model_name": "Qwen/Qwen3-8B", "layer_combinations": [ [ 59, 62, 64, 67, 70 ] ], "save_acts": false, "batch_size": 16, "dataset_name": "classification_ag_news", "dataset_folder": "sft_training_data", "seed": 42 }, { "custom_dataset_params": { "classification_dataset_name": "ner", "num_qa_per_sample": 6, "min_end_offset": -1, "max_end_offset": -5, "max_window_size": 1, "min_window_size": 1 }, "num_train": 1500, "num_test": 250, "splits": [ "train", "test" ], "model_name": "Qwen/Qwen3-8B", "layer_combinations": [ [ 59, 62, 64, 67, 70 ] ], "save_acts": false, "batch_size": 16, "dataset_name": "classification_ner", "dataset_folder": "sft_training_data", "seed": 42 }, { "custom_dataset_params": { "classification_dataset_name": "ner", "num_qa_per_sample": 3, "min_end_offset": -1, "max_end_offset": -5, "max_window_size": 50, "min_window_size": 1 }, "num_train": 1500, "num_test": 250, "splits": [ "train", "test" ], "model_name": "Qwen/Qwen3-8B", "layer_combinations": [ [ 59, 62, 64, 67, 70 ] ], "save_acts": false, "batch_size": 16, "dataset_name": "classification_ner", "dataset_folder": "sft_training_data", "seed": 42 }, { "custom_dataset_params": { "classification_dataset_name": "tense", "num_qa_per_sample": 6, "min_end_offset": -1, "max_end_offset": -5, "max_window_size": 1, "min_window_size": 1 }, "num_train": 1500, "num_test": 250, "splits": [ "train", "test" ], "model_name": "Qwen/Qwen3-8B", "layer_combinations": [ [ 59, 62, 64, 67, 70 ] ], "save_acts": false, "batch_size": 16, "dataset_name": "classification_tense", "dataset_folder": "sft_training_data", "seed": 42 }, { "custom_dataset_params": { "classification_dataset_name": "tense", "num_qa_per_sample": 3, "min_end_offset": -1, "max_end_offset": -5, "max_window_size": 50, "min_window_size": 1 }, "num_train": 1500, "num_test": 250, "splits": [ "train", "test" ], "model_name": "Qwen/Qwen3-8B", "layer_combinations": [ [ 59, 62, 64, 67, 70 ] ], "save_acts": false, "batch_size": 16, "dataset_name": "classification_tense", "dataset_folder": "sft_training_data", "seed": 42 }, { "custom_dataset_params": { "classification_dataset_name": "language_identification", "num_qa_per_sample": 6, "min_end_offset": -1, "max_end_offset": -5, "max_window_size": 1, "min_window_size": 1 }, "num_train": 0, "num_test": 250, "splits": [ "test" ], "model_name": "Qwen/Qwen3-8B", "layer_combinations": [ [ 59, 62, 64, 67, 70 ] ], "save_acts": false, "batch_size": 4, "dataset_name": "classification_language_identification", "dataset_folder": "sft_training_data", "seed": 42 }, { "custom_dataset_params": { "classification_dataset_name": "language_identification", "num_qa_per_sample": 3, "min_end_offset": -1, "max_end_offset": -5, "max_window_size": 50, "min_window_size": 1 }, "num_train": 0, "num_test": 250, "splits": [ "test" ], "model_name": "Qwen/Qwen3-8B", "layer_combinations": [ [ 59, 62, 64, 67, 70 ] ], "save_acts": false, "batch_size": 16, "dataset_name": "classification_language_identification", "dataset_folder": "sft_training_data", "seed": 42 }, { "custom_dataset_params": { "classification_dataset_name": "singular_plural", "num_qa_per_sample": 6, "min_end_offset": -1, "max_end_offset": -5, "max_window_size": 1, "min_window_size": 1 }, "num_train": 0, "num_test": 250, "splits": [ "test" ], "model_name": "Qwen/Qwen3-8B", "layer_combinations": [ [ 59, 62, 64, 67, 70 ] ], "save_acts": false, "batch_size": 16, "dataset_name": "classification_singular_plural", "dataset_folder": "sft_training_data", "seed": 42 }, { "custom_dataset_params": { "classification_dataset_name": "singular_plural", "num_qa_per_sample": 3, "min_end_offset": -1, "max_end_offset": -5, "max_window_size": 50, "min_window_size": 1 }, "num_train": 0, "num_test": 250, "splits": [ "test" ], "model_name": "Qwen/Qwen3-8B", "layer_combinations": [ [ 59, 62, 64, 67, 70 ] ], "save_acts": false, "batch_size": 16, "dataset_name": "classification_singular_plural", "dataset_folder": "sft_training_data", "seed": 42 }, { "custom_dataset_params": { "min_k_tokens": 1, "max_k_tokens": 50, "min_k_activations": 1, "max_k_activations": 1, "max_length": 2000, "directions": [ "past", "future" ], "vllm_max_new_tokens": 200, "max_vllm_context_tokens": 2000, "future_chat_system_prompt_prob": 0.0, "system_prompt_path": "data_pipelines/latentqa_datasets/train/system.json", "english_only_temp_filter": false, "pretrain_dataset": "ceselder/cot-oracle-corpus-v5", "pretrain_key": "cot_response", "pretrain_split": "train", "pretrain_frac": 1.0, "future_corpus_only": true, "past_use_vllm": false }, "num_train": 1100000, "num_test": 0, "splits": [ "train" ], "model_name": "Qwen/Qwen3-8B", "layer_combinations": [ [ 59, 62, 64, 67, 70 ] ], "save_acts": false, "batch_size": 16, "dataset_name": "past_lens", "dataset_folder": "sft_training_data", "seed": 42 } ], "dataset_loader_names": [ "cot_oracle_convqa", "classification_geometry_of_truth", "classification_geometry_of_truth", "classification_relations", "classification_relations", "classification_sst2", "classification_sst2", "classification_md_gender", "classification_md_gender", "classification_snli", "classification_snli", "classification_ag_news", "classification_ag_news", "classification_ner", "classification_ner", "classification_tense", "classification_tense", "classification_language_identification", "classification_language_identification", "classification_singular_plural", "classification_singular_plural", "past_lens" ], "validation_dataset_configs": [ { "custom_dataset_params": { "hf_dataset_repo": "cds-jb/cot-oracle-convqa-chunked-haiku", "hf_split": "test", "cot_prefix_field": "cot_prefix", "stochastic_max_k": 100, "max_cot_prefix_tokens": 2048, "target_field": "target_response" }, "num_train": 25, "num_test": 0, "splits": [ "train" ], "model_name": "Qwen/Qwen3-8B", "layer_combinations": [ [ 62 ] ], "save_acts": false, "batch_size": 16, "dataset_name": "cot_oracle_convqa", "dataset_folder": "sft_training_data", "seed": 45 } ], "validation_dataset_loader_names": [ "cot_oracle_convqa" ], "use_decoder_vectors": true, "generation_kwargs": { "do_sample": false, "max_new_tokens": 64 }, "steering_coefficient": 1.0, "dataset_folder": "sft_training_data", "chat_regularization_path": null, "chat_regularization_every_n_ao_updates": null, "chat_regularization_weight": 1.0, "chat_regularization_max_train_examples": null, "monitor_num_eval_examples_per_component": null, "monitor_num_eval_examples_classification_total": null, "monitor_eval_steps": null, "monitor_eval_on_start": false, "validation_steps": 500, "validation_on_start": false, "train_batch_size": 16, "eval_batch_size": 64, "train_batches_per_materialization_block": 16, "use_lora": true, "lora_r": 128, "lora_alpha": 16, "lora_dropout": 0.0, "lora_target_modules": "all-linear", "use_rslora": true, "max_target_tokens": 50000000, "use_unsloth": true, "unsloth_max_seq_length": 4096, "num_epochs": 1, "lr": 3e-05, "gradient_accumulation_steps": 1, "max_grad_norm": 1.0, "eval_steps": 999999, "eval_on_start": false, "gradient_checkpointing": false, "window_mult": 20, "save_steps": 999999, "save_dir": "/workspace/checkpoints/ao_q3_8b_v3_multi5_sonnet_norm2p0", "max_train_examples": 2500000, "seed": 42, "eval_logs_path": "eval_logs.json", "load_lora_path": null, "created_at_utc": "", "git_commit": "", "wandb_project": "ao-v3-clean-ablations", "wandb_run_name": "ao_q3_8b_v3_multi5_sonnet_norm2p0", "wandb_suffix": "_v3_multi5_sonnet_norm2p0", "examples_per_source_epoch": 26669, "hf_push_to_hub": false, "hf_private_repo": false, "hf_repo_name": "", "hf_repo_id": "", "load_in_8bit": false, "open_ended_eval_include": null, "open_ended_eval_max_entries": 7, "positive_negative_examples": false }